Background

This file is designed to analyze coronavirus data from a single state using three data sources: the COVID Tracking Project, USA Facts, and CDC weekly all-cause death counts.

Code for processing data from each of these sources is available in the sourced function files loaded below (Shared, CTP, USAF, and CDC, all version 003).

The goal of this file is to download updated data for the three main data sources, and to explore the performance of the segments as measured against the new data.

Functions are sourced and a variable mapping file is created:

# All functions assume that tidyverse and its components are loaded and available
# Other functions are declared in the sourcing files or use library::function()
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# If the same function is in both files, use the version from the more specific source
source("./Coronavirus_Statistics_Functions_Shared_v003.R")
source("./Coronavirus_Statistics_Functions_CTP_v003.R")
source("./Coronavirus_Statistics_Functions_USAF_v003.R")
source("./Coronavirus_Statistics_Functions_CDC_v003.R")

# Create a variable mapping file
varMapper <- c(
  # --- Raw counts and rolling means -------------------------------------
  cases       = "Cases",
  newCases    = "Increase in cases, most recent 30 days",
  casesroll7  = "Rolling 7-day mean cases",
  deaths      = "Deaths",
  newDeaths   = "Increase in deaths, most recent 30 days",
  deathsroll7 = "Rolling 7-day mean deaths",
  # --- Population-adjusted (per million) metrics ------------------------
  cpm         = "Cases per million",
  cpm7        = "Cases per day (7-day rolling mean) per million",
  newcpm      = "Increase in cases, most recent 30 days, per million",
  dpm         = "Deaths per million",
  dpm7        = "Deaths per day (7-day rolling mean) per million",
  newdpm      = "Increase in deaths, most recent 30 days, per million",
  hpm7        = "Currently Hospitalized per million (7-day rolling mean)",
  tpm         = "Tests per million",
  tpm7        = "Tests per million per day (7-day rolling mean)",
  # --- Source-specific series -------------------------------------------
  cdcExcess   = "Excess all-cause (CDC)",
  ctp_death7  = "COVID Tracking Project",
  usaf_death7 = "USA Facts",
  CDC_deaths  = "CDC total deaths",
  CDC_excess  = "CDC excess deaths",
  CTP_cases   = "COVID Tracking Project cases",
  CTP_deaths  = "COVID Tracking Project deaths",
  CTP_hosp    = "COVID Tracking Project hospitalized",
  CTP_tests   = "COVID Tracking Project tests",
  USAF_cases  = "USA Facts cases",
  USAF_deaths = "USA Facts deaths",
  # --- Generic per-million labels ---------------------------------------
  vpm7        = "Per million people (7-day rolling daily average)",
  vpm         = "Per million people"
)

Data Updates (early November)

New data from COVID Tracking Project are downloaded and assessed against the existing state-level segments:

# Use existing segments with updated data.
# Read the 2020-10-25 reference run from disk once; the original code
# called readFromRDS("test_hier5_201025") twice (once for $dfRaw, once
# for $useClusters), re-reading the same RDS file.
ref_hier5_201025 <- readFromRDS("test_hier5_201025")

locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201108.csv"
test_old_201108 <- readRunCOVIDTrackingProject(thruLabel="Nov 7, 2020",
                                               # Download only when no local copy exists;
                                               # otherwise reuse the cached CSV
                                               downloadTo=if (file.exists(locDownload)) NULL else locDownload,
                                               readFrom=locDownload,
                                               compareFile=ref_hier5_201025$dfRaw,
                                               useClusters=ref_hier5_201025$useClusters
                                               )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 7520 parsing failures.
## row          col   expected               actual                                                 file
##   3 dateModified valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
##   3 dateChecked  valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
##   4 dateModified valid date 2020-11-01T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
##   4 dateChecked  valid date 2020-11-01T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
##   5 dateModified valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## ... ............ .......... .................... ....................................................
## See problems(...) for more details.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1          9761373        229238               9362912
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##          date             name newValue oldValue
## 1  2020-03-07 positiveIncrease      171      176
## 2  2020-03-11 positiveIncrease      502      509
## 3  2020-03-13 positiveIncrease     1059     1072
## 4  2020-03-18 positiveIncrease     3023     3089
## 5  2020-03-26 positiveIncrease    17544    17720
## 6  2020-03-28 positiveIncrease    19586    19925
## 7  2020-03-29 positiveIncrease    19570    19348
## 8  2020-03-30 positiveIncrease    21691    22042
## 9  2020-04-01 positiveIncrease    26078    25791
## 10 2020-04-06 positiveIncrease    28592    29002
## 11 2020-04-13 positiveIncrease    24758    25195
## 12 2020-04-15 positiveIncrease    29755    30307
## 13 2020-04-16 positiveIncrease    31489    30978
## 14 2020-04-24 positiveIncrease    33698    34231
## 15 2020-05-12 positiveIncrease    22520    22890
## 16 2020-05-13 positiveIncrease    21577    21285
## 17 2020-05-15 positiveIncrease    25371    24685
## 18 2020-05-16 positiveIncrease    23560    24702
## 19 2020-05-17 positiveIncrease    20344    20009
## 20 2020-05-18 positiveIncrease    20812    21028
## 21 2020-05-23 positiveIncrease    22167    21531
## 22 2020-05-24 positiveIncrease    19148    20072
## 23 2020-05-30 positiveIncrease    23443    23682
## 24 2020-06-04 positiveIncrease    20256    20886
## 25 2020-06-05 positiveIncrease    23004    23394
## 26 2020-06-06 positiveIncrease    22773    23064
## 27 2020-06-10 positiveIncrease    20637    20894
## 28 2020-06-12 positiveIncrease    23185    23597
## 29 2020-06-18 positiveIncrease    27135    27746
## 30 2020-06-19 positiveIncrease    30881    31471
## 31 2020-06-21 positiveIncrease    28991    27928
## 32 2020-06-23 positiveIncrease    33848    33447
## 33 2020-07-02 positiveIncrease    53385    54085
## 34 2020-07-06 positiveIncrease    41416    41959
## 35 2020-08-01 positiveIncrease    60416    61101
## 36 2020-08-08 positiveIncrease    53158    53712
## 37 2020-08-14 positiveIncrease    57254    55636
## 38 2020-08-22 positiveIncrease    45722    46236
## 39 2020-09-02 positiveIncrease    30287    30603
## 40 2020-09-07 positiveIncrease    28237    28682
## 41 2020-09-15 positiveIncrease    34879    35445
## 42 2020-09-19 positiveIncrease    44886    45564
## 43 2020-09-20 positiveIncrease    35688    36295
## 44 2020-09-21 positiveIncrease    39062    39472
## 45 2020-09-24 positiveIncrease    43243    43772
## 46 2020-09-27 positiveIncrease    35061    35454
## 47 2020-09-28 positiveIncrease    36056    36524
## 48 2020-09-29 positiveIncrease    36289    36947
## 49 2020-10-21 positiveIncrease    60657    58606
## 50 2020-10-22 positiveIncrease    72887    75248
## Joining, by = c("date", "name")
## Warning: Removed 14 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##   state             name newValue oldValue
## 1    FL positiveIncrease   766305   776249
## 2    PR positiveIncrease    31067    61275
## 3    RI positiveIncrease    30581    30116
## Rows: 13,999
## Columns: 55
## $ date                        <date> 2020-11-07, 2020-11-07, 2020-11-07, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 19306, 202482, 120828, 0, 257384, 95695...
## $ probableCases               <dbl> NA, 30709, 10812, NA, 6590, NA, 7501, 4...
## $ negative                    <dbl> 728589, 1224595, 1308477, 1768, 1608041...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults            <dbl> 747288, 1396368, 1418493, 1768, 1858835...
## $ hospitalizedCurrently       <dbl> 105, 1015, 633, NA, 1139, 3456, 1041, 4...
## $ hospitalizedCumulative      <dbl> NA, 21294, 7415, NA, 22170, NA, 9911, 1...
## $ inIcuCurrently              <dbl> NA, NA, 235, NA, 249, 901, NA, NA, 25, ...
## $ inIcuCumulative             <dbl> NA, 2121, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 8, NA, 105, NA, 137, NA, NA, NA, 14, NA...
## $ onVentilatorCumulative      <dbl> NA, 1225, 865, NA, NA, NA, NA, NA, NA, ...
## $ recovered                   <dbl> 7157, 84471, 106594, NA, 42950, NA, 847...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "B...
## $ lastUpdateEt                <chr> "11/7/2020 03:59", "11/7/2020 11:00", "...
## $ dateModified                <dttm> 2020-11-07 03:59:00, 2020-11-07 11:00:...
## $ checkTimeEt                 <chr> "11/06 22:59", "11/07 06:00", "11/06 19...
## $ death                       <dbl> 84, 3082, 2068, 0, 6147, 17939, 2168, 4...
## $ hospitalized                <dbl> NA, 21294, 7415, NA, 22170, NA, 9911, 1...
## $ dateChecked                 <dttm> 2020-11-07 03:59:00, 2020-11-07 11:00:...
## $ totalTestsViral             <dbl> 747288, 1396368, 1418493, 1768, NA, 195...
## $ positiveTestsViral          <dbl> 21064, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 725850, NA, 1308477, NA, NA, NA, NA, NA...
## $ positiveCasesViral          <dbl> 19306, 171773, 110016, 0, 250794, 95695...
## $ deathConfirmed              <dbl> 84, 2864, 1890, NA, 5730, NA, NA, 3757,...
## $ deathProbable               <dbl> NA, 218, 178, NA, 417, NA, NA, 914, NA,...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 2177091, NA, 55...
## $ totalTestsPeopleViral       <dbl> NA, NA, NA, NA, 1858835, NA, 1303345, N...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 324293, NA, 187631, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 14007, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 173624, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 66104, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 76277, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 12392, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 26512, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 607, 1768, 1598, 0, 2620, 5863, 3463, 0...
## $ negativeIncrease            <dbl> -34013, 7593, 10213, 0, 14060, 162939, ...
## $ total                       <dbl> 747895, 1427077, 1429305, 1768, 1865425...
## $ totalTestResultsIncrease    <dbl> -34013, 8920, 11491, 0, 16524, 168802, ...
## $ posNeg                      <dbl> 747895, 1427077, 1429305, 1768, 1865425...
## $ deathIncrease               <dbl> 0, 33, 12, 0, 38, 73, 10, 0, 2, 0, 87, ...
## $ hospitalizedIncrease        <dbl> 0, 0, 14, 0, 152, 0, 197, 0, 0, 0, 161,...
## $ hash                        <chr> "f2176e93601204643e0618a661e7c3603f44f4...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState   cases deaths  hosp     tests     n
##   <lgl>        <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE        44159    976    NA    457982  1185
## 2 TRUE       9717214 228262    NA 154892254 12814
## Rows: 12,814
## Columns: 6
## $ date   <date> 2020-11-07, 2020-11-07, 2020-11-07, 2020-11-07, 2020-11-07,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 607, 1768, 1598, 2620, 5863, 3463, 0, 99, 223, 4380, 1719, 1...
## $ deaths <dbl> 0, 33, 12, 38, 73, 10, 0, 2, 0, 87, 39, 0, 11, 8, 91, 45, 0,...
## $ hosp   <dbl> 105, 1015, 633, 1139, 3456, 1041, 402, 77, 115, 2672, 1859, ...
## $ tests  <dbl> -34013, 8920, 11491, 16524, 168802, 35158, 0, 4262, 5159, 48...
## Rows: 12,814
## Columns: 14
## $ date   <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state  <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-10-09 through current
## 
## Recency is defined as 2020-10-09 through current

## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the refreshed CTP run for reuse by later sections.
# NOTE(review): ovrWriteError=FALSE appears to mean an existing RDS of the
# same name is kept rather than raising an error (the console output of
# the analogous county-level save below shows the skip) -- confirm against
# the saveToRDS definition in the sourced Shared functions file.
saveToRDS(test_old_201108, ovrWriteError=FALSE)

Cases appear to be spiking in some of the states in the segment that had previously been less impacted. Hospitalizations in aggregate in this segment are starting to slope upwards, though not yet at the same rate as the increase in cases.

New data from USA Facts are downloaded and assessed against the existing county-level segments:

# Locations for the population, case, and death file
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201109.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201109.csv"

# Read the 2020-10-26 county run from disk once; the original code called
# readFromRDS("cty_20201026") twice (once for $dfBurden, once for $clustVec).
ref_cty_20201026 <- readFromRDS("cty_20201026")

# Run old segments against new data
cty_old_20201109 <- readRunUSAFacts(maxDate="2020-11-07",
                                    popLoc=popLoc,
                                    caseLoc=caseLoc,
                                    deathLoc=deathLoc,
                                    # Download only if either local file is missing;
                                    # scalar && (not elementwise &) for a single condition
                                    dlCaseDeath=!(file.exists(caseLoc) && file.exists(deathLoc)),
                                    oldFile=ref_cty_20201026$dfBurden,
                                    existingCountyClusters=ref_cty_20201026$clustVec
                                    )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 929,745
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
##  row     col               expected actual                                                                      file
## 1366 11/7/20 no trailing characters  1,020 './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201109.csv'
## Rows: 929,745
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `summarise()` ungrouping output (override with `.groups` argument)

## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0           NA               NA               0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

## 
## Recency is defined as 2020-10-09 through current
## 
## Recency is defined as 2020-10-09 through current
## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the county-level run. Per the console output that follows, with
# ovrWriteError=FALSE an already-existing RDS is left in place (the call
# reports the skip and returns NULL) instead of erroring.
saveToRDS(cty_old_20201109, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201109.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

Cases appear to be growing in many of the county clusters, though deaths per million per day remain well below the peaks observed in April in the earlier-hit counties.

Next, all-cause death data from the CDC are downloaded and assessed:

# Use data that have previously been downloaded.
# NOTE(review): `loc` is passed as a bare filename while the dlData check
# prepends "./RInputFiles/Coronavirus/" -- presumably readRunCDCAllCause
# resolves the path the same way internally; confirm against the CDC
# functions file.
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201110.csv"
# startYear=2015 anchors the 2015-2019 baseline period visible in the
# output below; weekThru=36 and cvDeathThru="2020-09-05" align the CDC
# series (MMWR week 36 ends 2020-09-05) with the CTP COVID-death data.
cdcList_20201110 <- readRunCDCAllCause(loc=cdcLoc, 
                                       startYear=2015, 
                                       curYear=2020,
                                       weekThru=36, 
                                       startWeek=9, 
                                       lst=readFromRDS("test_old_201108"), 
                                       epiMap=readFromRDS("epiMonth"), 
                                       agePopData=readFromRDS("usPopBucket2020"), 
                                       cvDeathThru="2020-09-05", 
                                       cdcPlotStartWeek=10, 
                                       dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc))
                                       )
## Rows: 178,482
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 178,482
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        26725           5  3266337
## 2 45-64 years        32640          10 12792834
## 3 65-74 years        32632          12 12691956
## 4 75-84 years        32652          14 15776825
## 5 85 years and older 32640          15 20567595
## 6 Under 25 years     21193           0  1407550
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15317           0 5643342
##  6 2015-2019  2017 Unweighted           15317           0 5643342
##  7 2015-2019  2018 Predicted (weighted) 15305           0 5698005
##  8 2015-2019  2018 Unweighted           15305           0 5698005
##  9 2015-2019  2019 Predicted (weighted) 15319           0 5725544
## 10 2015-2019  2019 Unweighted           15319           0 5725544
## 11 2020       2020 Predicted (weighted) 12677          34 5332986
## 12 2020       2020 Unweighted           12623          22 5236015
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153182           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     56          56  0.    
## 3 2020      <NA>                                       25244           0  1.06e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153182           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  19873          10  8.61e6
## 3 2020     Data in recent weeks are incomplete. Only~    456           0  2.12e5
## 4 2020     Data in recent weeks are incomplete. Only~    384          35  4.69e4
## 5 2020     Data in recent weeks are incomplete. Only~   2213          11  7.28e5
## 6 2020     Data in recent weeks are incomplete. Only~     12           0  7.12e3
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    328           0  1.25e5
## 9 2020     <NA>                                         1986           0  8.13e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3636           0 33131804
## 2     CA           California 3636           0  3152950
## 3     FL              Florida 3636           0  2423089
## 4     TX                Texas 3636           0  2383305
## 5     PA         Pennsylvania 3636           0  1589393
## 6     OH                 Ohio 3636           0  1432144
## 7     IL             Illinois 3636           0  1250417
## 8     NY             New York 3636           0  1183446
## 9     MI             Michigan 3636           0  1140202
## 10    NC       North Carolina 3568          28  1066679
## 11    GA              Georgia 3636           0   994220
## 12    NJ           New Jersey 3630           0   886576
## 13    TN            Tennessee 3636           0   865673
## 14    VA             Virginia 3636           0   795842
## 15    IN              Indiana 3632           0   769795
## 16    MO             Missouri 3633           0   749309
## 17    AZ              Arizona 3636           0   702644
## 18    MA        Massachusetts 3602           0   701039
## 19    YC        New York City 3632           0   685046
## 20    WA           Washington 3636           0   662228
## 21    AL              Alabama 3635           0   615046
## 22    WI            Wisconsin 3618           0   608567
## 23    MD             Maryland 3630           0   585192
## 24    SC       South Carolina 3634           0   576703
## 25    KY             Kentucky 3599           0   560150
## 26    LA            Louisiana 3628           0   541388
## 27    MN            Minnesota 3592           0   517087
## 28    CO             Colorado 3634           0   458526
## 29    OK             Oklahoma 3622           9   455633
## 30    OR               Oregon 3466           0   424398
## 31    MS          Mississippi 3574           0   374274
## 32    AR             Arkansas 3532           0   372266
## 33    CT          Connecticut 3191          14   365303
## 34    IA                 Iowa 3271           0   349432
## 35    PR          Puerto Rico 3347           0   340685
## 36    KS               Kansas 3327           0   304875
## 37    NV               Nevada 3372           0   297386
## 38    WV        West Virginia 3075           2   257638
## 39    UT                 Utah 3522           0   219851
## 40    NM           New Mexico 3210           0   210537
## 41    NE             Nebraska 2920           0   193764
## 42    ME                Maine 2712           0   164439
## 43    ID                Idaho 2834           0   157266
## 44    NH        New Hampshire 2734           0   138417
## 45    HI               Hawaii 2625           0   127727
## 46    RI         Rhode Island 2534           3   116993
## 47    MT              Montana 2618           0   113028
## 48    DE             Delaware 2627           0   102300
## 49    SD         South Dakota 2510           0    89238
## 50    ND         North Dakota 2496           0    77815
## 51    DC District of Columbia 2609           0    65724
## 52    VT              Vermont 2392           0    63461
## 53    WY              Wyoming 2379           0    48740
## 54    AK               Alaska 2412           0    43447
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 178,482
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,264
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 5 x 11
##   Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##   <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
## 1 North Carol~ 2020-09-05 NC    2020     36 25-4~     NA 2020   Pred~ Suppres~
## 2 North Carol~ 2020-09-05 NC    2020     36 45-6~     NA 2020   Pred~ Suppres~
## 3 North Carol~ 2020-09-05 NC    2020     36 65-7~     NA 2020   Pred~ Suppres~
## 4 North Carol~ 2020-09-05 NC    2020     36 75-8~     NA 2020   Pred~ Suppres~
## 5 North Carol~ 2020-09-05 NC    2020     36 85 y~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
## Data suppression checks OK - 5 records in current week/year suppressed
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,069
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the processed CDC list; per the output below, an existing file of the
# same name is kept rather than overwritten (ovrWriteError=FALSE suppresses the error)
saveToRDS(cdcList_20201110, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201110.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

The death data can then be combined and analyzed, using the function from Coronavirus_Statistics_States_v003:

combineDeathData <- function(ctp, 
                             usaf, 
                             cdc, 
                             keyState, 
                             curYear=2020,
                             minDate=as.Date(paste0(curYear, "-01-01")), 
                             perMillion=FALSE,
                             glimpseIntermediate=FALSE, 
                             facetFreeY=!perMillion, 
                             returnData=TRUE
                             ) {
    
    # Combine daily deaths data from the three sources (COVID Tracking Project,
    # USA Facts, CDC excess deaths) for the requested state(s), plot the
    # rolling 7-day death curves, and optionally return the merged daily frame.
    
    # FUNCTION ARGUMENTS:
    # ctp: the list with COVID Tracking Project data (or the 'consolidatedPlotData' frame directly)
    # usaf: the list with USA Facts data (or the 'clusterStateData' frame directly)
    # cdc: the list with CDC data (or the 'stateAgg' frame directly)
    # keyState: the state(s) to be explored
    # curYear: current year
    # minDate: the minimum date to use in the CDC data
    # perMillion: boolean, should data be shown on a per-million-people basis?
    # glimpseIntermediate: boolean, should glimpses of frames be provided as they are built?
    # facetFreeY: boolean, should facets be created with free_y scales (only relevant if 2+ keyStates)
    # returnData: boolean, should the data frame be returned?
    
    # STEP 0a: Extract relevant elements from lists (use frame as-is if directly passed)
    if ("list" %in% class(ctp)) ctp <- ctp[["consolidatedPlotData"]]
    if ("list" %in% class(usaf)) usaf <- usaf[["clusterStateData"]]
    if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]
    
    # STEP 0b: Create a mapping file of date to epiWeek (one row per calendar day
    # through year end, tagged with its lubridate epidemiological week number)
    epiMap <- tibble::tibble(date=seq.Date(from=minDate, to=as.Date(paste0(curYear, "-12-31")), by="1 day"), 
                             week=lubridate::epiweek(date)
                             )
    
    # STEP 1: Filter to only relevant data
    # STEP 1a: COVID Tracking Project (deaths metric only, for the requested states)
    ctp <- ctp %>%
        ungroup() %>%
        filter(name=="deaths", state %in% keyState)
    if(glimpseIntermediate) glimpse(ctp)
    # STEP 1b: USA Facts (requested states only; still at sub-state granularity here)
    ctp <- ctp
    usaf <- usaf %>%
        ungroup() %>%
        filter(state %in% keyState)
    if(glimpseIntermediate) glimpse(usaf)
    # STEP 1c: CDC (current year only, for the requested states)
    cdc <- cdc %>%
        ungroup() %>%
        filter(year==curYear, state %in% keyState)
    if(glimpseIntermediate) glimpse(cdc)
    
    # STEP 2a: Sum the county-level data so that it is state-level data
    # (dpm7 is combined as a population-weighted mean across counties)
    usafState <- usaf %>%
        group_by(state, date) %>%
        summarize(deaths=sum(deaths), dpm7=sum(dpm7*pop)/sum(pop), pop=sum(pop), .groups="drop_last") %>%
        ungroup()
    # STEP 2b: Convert the CDC data to an estimated daily total (split the weekly total evenly)
    # Joining epiMap by week fans each weekly CDC record out to its 7 dates;
    # dividing by 7 then spreads the weekly totals evenly across those days
    cdcDaily <- cdc %>%
        left_join(epiMap, by=c("week")) %>%
        select(state, week, date, cdcDeaths=deaths, cdcExcess=delta) %>%
        mutate(cdcDeaths=cdcDeaths/7, cdcExcess=cdcExcess/7)
    
    # STEP 3: Create a state death-level database by date (full joins keep dates
    # present in any one source); the *_death7 columns back out daily death
    # counts from the per-million rolling rates
    dailyDeath <- select(ctp, state, date, ctpDeaths=value, ctp_dpm7=vpm7, ctp_pop=pop) %>%
        full_join(select(usafState, state, date, usafDeaths=deaths, usaf_dpm7=dpm7, usaf_pop=pop), 
                  by=c("state", "date")
                  ) %>%
        full_join(cdcDaily, by=c("state", "date")) %>%
        arrange(state, date) %>%
        mutate(ctp_death7=ctp_dpm7*ctp_pop/1000000, usaf_death7=usaf_dpm7*usaf_pop/1000000)
    if(glimpseIntermediate) glimpse(dailyDeath)

    # STEP 4a: Assign a population by state (largest of the source populations)
    statePop <- dailyDeath %>%
        group_by(state) %>%
        summarize(pop=max(usaf_pop, ctp_pop, na.rm=TRUE), .groups="drop_last")
    
    # STEP 4b: Plot the deaths data (optionally rescaled to per-million inside aes)
    p1 <- dailyDeath %>%
        select(state, date, ctp_death7, usaf_death7, cdcExcess) %>%
        pivot_longer(-c(state, date), names_to="source", values_to="deaths") %>%
        filter(!is.na(deaths)) %>%
        left_join(statePop, by="state") %>%
        ggplot(aes(x=date, y=deaths*if(perMillion) (1000000/pop) else 1)) + 
        geom_line(aes(group=source, color=varMapper[source])) + 
        labs(x="", 
             y=paste0("Deaths", if(perMillion) " per million" else ""), 
             title=paste0(curYear, " deaths per day in ", paste0(keyState, collapse=", ")),
             subtitle=paste0("Rolling 7-day average", if(perMillion) " per million people" else ""),
             caption="CDC estimated excess all-cause deaths, weekly total divided by 7 to estimate daily total"
             ) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        scale_color_discrete("Data source") + 
        theme(legend.position="bottom") + 
        geom_hline(yintercept=0, lty=2)
    if (length(keyState) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFreeY) "free_y" else "fixed")
    print(p1)
    
    # STEP 5: Return the daily death file (invisible NULL when returnData=FALSE)
    if(returnData) dailyDeath
    
}


# Example function calls: same set of hard-hit states, first in absolute counts
combineDeathData(ctp=test_old_201108, 
                 usaf=cty_old_20201109$clusterStateData, 
                 cdc=cdcList_20201110, 
                 keyState=c("NY", "NJ", "MA", "FL", "GA", "TX", "AZ", "MS", "LA", "MI", "IL", "WI"), 
                 perMillion=FALSE, 
                 returnData=FALSE
                 )

# Same states, but rescaled to a per-million-people basis for comparability
combineDeathData(ctp=test_old_201108, 
                 usaf=cty_old_20201109$clusterStateData, 
                 cdc=cdcList_20201110, 
                 keyState=c("NY", "NJ", "MA", "FL", "GA", "TX", "AZ", "MS", "LA", "MI", "IL", "WI"), 
                 perMillion=TRUE, 
                 returnData=FALSE
                 )

In general, the shapes of the curves are well aligned, though the CDC excess-deaths curves tend to start earlier and peak higher than the coronavirus deaths curves from COVID Tracking Project and USA Facts. The northeastern states hit early show a classic epidemic peak curve, while the states hit later tend to have lower peaks (per million) but, in some cases, a much more extended timeline of non-zero disease impact.

Next, an integrated state data file is created from the latest data:

# Load the previously saved snapshots of the three processed data sources
ctpList <- readFromRDS("test_old_201108")
usafData <- readFromRDS("cty_old_20201109")$clusterStateData
cdcList <- readFromRDS("cdcList_20201110")

# Function to convert a COVID Tracking Project file for further processing
prepCTPData <- function(ctp) {
    
    # FUNCTION ARGUMENTS:
    # ctp: a properly formatted list or data frame containing processed COVID Tracking Project data

    # If a list was passed, extract the relevant data frame from it
    if ("list" %in% class(ctp)) {
        ctp <- ctp[["consolidatedPlotData"]]
    }

    # Ungroup, drop the pseudo-state 'cluster', and derive a value7 metric from vpm7
    baseData <- ctp %>%
        ungroup() %>%
        filter(state != "cluster") %>%
        mutate(value7=ifelse(is.na(vpm7), NA, vpm7*pop/1000000))
    
    # Demographic file: one row per state-cluster combination with its population
    ctpDemo <- baseData %>%
        group_by(state, cluster) %>%
        summarize(pop=max(pop, na.rm=TRUE), .groups="drop_last") %>%
        ungroup()
    
    # Data file: tag each metric with its source and build a combined 'name' field
    ctpData <- baseData %>%
        rename(metric=name) %>%
        mutate(source="CTP", name=paste0(source, "_", metric)) %>%
        select(state, date, metric, source, name, value, value7, vpm, vpm7)
    
    # Return both data frames as a named list
    list(ctpDemo=ctpDemo, ctpData=ctpData)
    
}

# Prepare the COVID Tracking Project data (list with elements ctpDemo and ctpData)
ctpPrepped <- prepCTPData(ctpList)



# Function to convert a USA Facts file for further processing
prepUSAFData <- function(usaf) {
    
    # FUNCTION ARGUMENTS:
    # usaf: a properly formatted list or data frame containing processed USA Facts data

    # If a list was passed, extract the relevant data frame from it
    if ("list" %in% class(usaf)) {
        usaf <- usaf[["clusterStateData"]]
    }

    # Aggregate the county-level records up to state level, keeping only
    # state-date-pop-cases-deaths, then reshape to one row per state-date-metric
    longData <- usaf %>%
        group_by(state, date) %>%
        summarize(cases=sum(cases), deaths=sum(deaths), pop=sum(pop), .groups="drop_last") %>%
        ungroup() %>%
        pivot_longer(-c(state, date, pop), names_to="metric", values_to="value")
    
    # Within each state-pop-metric group (sorted by date), compute the rolling
    # 7-day aggregate for 'value', then add the per-million versions of both
    longData <- longData %>%
        group_by(state, pop, metric) %>%
        arrange(date) %>%
        helperRollingAgg(origVar="value", newName="value7") %>%
        ungroup() %>%
        mutate(vpm=value*1000000/pop, vpm7=value7*1000000/pop)
    
    # Demographic file: one row per state with its population
    usafDemo <- longData %>%
        group_by(state) %>%
        summarize(pop=max(pop, na.rm=TRUE), .groups="drop_last") %>%
        ungroup()
    
    # Data file: tag each metric with its source and build a combined 'name' field
    usafData <- longData %>%
        mutate(source="USAF", name=paste0(source, "_", metric)) %>%
        select(state, date, metric, source, name, value, value7, vpm, vpm7)
    
    # Return both data frames as a named list
    list(usafDemo=usafDemo, usafData=usafData)
    
}

# Prepare the USA Facts data (list with elements usafDemo and usafData)
usafPrepped <- prepUSAFData(usafData)



# Function to convert a CDC file for further processing
prepCDCData <- function(cdc, 
                        popData,
                        startYear=2020, 
                        startDate=as.Date(paste0(startYear, "-01-01")), 
                        endDate=as.Date(paste0(startYear, "-12-31"))
                        ) {
    
    # Convert weekly CDC state aggregates into estimated daily values, with
    # rolling 7-day and per-million metrics, matching the structure produced
    # by prepCTPData() and prepUSAFData().
    
    # FUNCTION ARGUMENTS:
    # cdc: a properly formatted list or data frame containing processed CDC data
    # popData: a file containing fields state-pop
    # startYear: starting year (CDC data will be filtered for this year and later)
    # startDate: the starting date for use in the mapping file to create daily estimates
    # endDate: the ending date for use in the mapping file to create daily estimates

    # Pull the relevant data frame if a list has been passed    
    if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]

    # Create a mapping file of dates to epiweek-epiyear (one row per calendar day)
    epiMap <- tibble::tibble(date=seq.Date(from=startDate, to=endDate, by="1 day"), 
                             year=lubridate::epiyear(date),
                             week=lubridate::epiweek(date)
                             )
    
    # Filter the data to the relevant year and keep state-year-week-deaths-excess
    cdc <- cdc %>%
        filter(yearint >= startYear) %>%
        select(state, yearint, week, deaths, excess=delta)

    # Merge in the daily mapping file (each weekly record fans out to its 7 days),
    # divide all totals by 7 to convert weekly to estimated daily, and pivot longer
    cdc <- cdc %>%
        left_join(epiMap, by=c("yearint"="year", "week"="week")) %>%
        mutate(deaths=deaths/7, excess=excess/7) %>%
        select(state, date, deaths, excess) %>%
        pivot_longer(-c(state, date), names_to="metric", values_to="value")
    
    # Create the rolling-7 for value, having grouped by state-metric and sorted by date
    # Add the per million component using the population from popData
    cdc <- cdc %>%
        group_by(state, metric) %>%
        arrange(date) %>%
        helperRollingAgg(origVar="value", newName="value7") %>%
        ungroup() %>%
        left_join(select(popData, state, pop), by="state") %>%
        mutate(vpm=value*1000000/pop, 
               vpm7=value7*1000000/pop, 
               source="CDC", 
               name=paste0(source, "_", metric)
               ) %>%
        select(state, date, pop, metric, source, name, value, value7, vpm, vpm7)
    
    # Return the key data frame as a list
    # NOTE(review): unlike ctpDemo/usafDemo, cdcDemo here is one row per
    # state-date-metric (states repeated), not unique by state — confirm intended
    list(cdcDemo=select(cdc, state, pop), cdcData=select(cdc, -pop))
    
}

# Create an integrated state demographics file
# full_join keeps states present in only one source, so popCTP or popUSAF can be
# NA; na.rm=TRUE makes pmax() fall back to the available population instead of
# propagating NA into the integrated 'pop' column (and downstream per-million values)
demoData <- ctpPrepped$ctpDemo %>%
    rename(popCTP=pop) %>%
    full_join(rename(usafPrepped$usafDemo, popUSAF=pop), by="state") %>%
    mutate(pop=pmax(popCTP, popUSAF, na.rm=TRUE))

# Prepare the CDC data using the integrated populations (list with cdcDemo and cdcData)
cdcPrepped <- prepCDCData(cdcList, popData=demoData)



# Integrated state data: stack the prepped CTP, USAF, and CDC data files,
# which all share the state-date-metric-source-name-value(7)-vpm(7) columns
stateData <- ctpPrepped$ctpData %>%
    bind_rows(usafPrepped$usafData) %>%
    bind_rows(cdcPrepped$cdcData)
glimpse(stateData)
## Rows: 104,103
## Columns: 9
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name   <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 6, 2, 8, 0, 14, 6,...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.1428571, 0.1428571, 0.1428571, 0.14...
## $ vpm    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7   <dbl> NA, NA, NA, 0.0000000, 0.1934601, 0.1934601, 0.1934601, 0.19...
# Control totals: sum value and value7 across all states and dates for each
# source_metric name, as a sanity check on the integrated file
stateData %>%
    group_by(name) %>%
    summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE), .groups="drop_last")
## # A tibble: 8 x 3
##   name             value     value7
##   <chr>            <dbl>      <dbl>
## 1 CDC_deaths    2224832.   2173735.
## 2 CDC_excess     242466.    240886.
## 3 CTP_cases     9717214    9372412.
## 4 CTP_deaths     228262     224869.
## 5 CTP_hosp      9301372    9036822.
## 6 CTP_tests   154892254  150735326.
## 7 USAF_cases    9679664    9336904.
## 8 USAF_deaths    233286     230182.

The alignment of cases and deaths data can then be plotted:

plotStateMetric <- function(df, 
                            yVal, 
                            namesPlot, 
                            keyStates, 
                            namesSec=NULL,
                            scaleSec=NULL,
                            plotTitle=NULL,
                            plotSub=NULL,
                            plotCaption=NULL,
                            primYLab=NULL,
                            secYLab="Caution, different metric and scale",
                            facetFixed=TRUE,
                            mapper=varMapper, 
                            combStates=vector("character", 0), 
                            popData=NULL, 
                            yValPerCap=(yVal %in% c("vpm", "vpm7")), 
                            printPlot=TRUE, 
                            returnData=FALSE
                            ) {
    
    # Plot one metric column of the integrated state data for selected states,
    # optionally combining states into pseudo-states and optionally placing a
    # second set of series on a rescaled secondary y-axis.
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues (passed as a string)
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # namesSec: names to be plotted on a secondary y-axes
    # scaleSec: scale to be used for the secondary axis 
    #           namesSec/scaleSec should be similar in magnitude to namesPlot
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # primYLab: primary y label (NULL means use mapper)
    # secYLab: secondary y label (default is "Caution, different metric and scale")
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    # mapper: mapping file for variable names to descriptive names
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # popData: a population file for combining states
    # yValPerCap: boolean, is the y-value of type per-capita?
    # printPlot: boolean, whether to print the plots
    # returnData: boolean, whether to return the data
    
    # Routine is only set up for a secondary axis with facetFixed=TRUE
    if (!is.null(namesSec) & !facetFixed) stop("\nSecondary axis only programmed for scales='fixed'\n")
    
    # Include variables in namesSec as part of namesPlot so they are kept by filter
    if (!is.null(namesSec)) namesPlot <- unique(c(namesPlot, namesSec))
    
    # Filter the data for only the key elements
    # NOTE(review): select_at(vars(...)) is superseded by select(all_of(...));
    # works as-is under the dplyr 1.0.2 this file was run with
    df <- df %>%
        select_at(vars(all_of(c("state", "date", "name", yVal)))) %>%
        filter(state %in% keyStates, name %in% namesPlot)
    
    # If there is a list of states to combine, process them
    if (length(combStates) > 0) {
        if (is.null(popData)) { stop("\nCombining states requires population data\n") }
        # Create a data frame with population and relabel combined states to their new names
        df <- df %>%
            left_join(select(popData, state, pop), by="state") %>%
            mutate(state=ifelse(state %in% names(combStates), combStates[state], state))
        # Aggregate to the new 'state' level data
        # (!!yVal := writes the result back into the dynamically-named column)
        if (yValPerCap) {
            # Per-capita values are combined as a population-weighted mean
            df <- df %>%
                group_by(state, date, name) %>%
                filter(!is.na(get(yVal))) %>%  # only sum population where yVal exists
                summarize(!!yVal:=sum(get(yVal)*pop)/sum(pop), pop=sum(pop), .groups="drop_last")
        } else {
            # Count-style values are simply summed across the combined states
            df <- df %>%
                group_by(state, date, name) %>%
                filter(!is.na(get(yVal))) %>% # only sum population where yVal exists
                summarize(!!yVal:=sum(get(yVal)), pop=sum(pop), .groups="drop_last")
        }
        # Ungroup data frame
        df <- df %>%
            ungroup()
    }
    
    # If there is a secondary scale but no scaleSec has been passed, create one
    # as the ratio of the secondary series' max to the primary series' max
    if (!is.null(namesSec) & is.null(scaleSec)) {
        maxPrimary <- df %>%
            filter(name %in% setdiff(namesPlot, namesSec)) %>%
            summarize(max(get(yVal), na.rm=TRUE), .groups="drop_last") %>%
            max()
        maxSecondary <- df %>%
            filter(name %in% namesSec) %>%
            summarize(max(get(yVal), na.rm=TRUE), .groups="drop_last") %>%
            max()
        scaleSec <- maxSecondary/maxPrimary
        cat("\nWill scale by:", scaleSec, "\n")
    }
    
    # Create the primary y-axis label from mapper if it has not been passed
    if (is.null(primYLab)) primYLab <- mapper[yVal]
    
    # Create the relevant line plot
    # NOTE(review): aes_string() is deprecated in newer ggplot2; fine under 3.3.2
    if (printPlot) {
        p1 <- df %>%
            filter(!is.na(get(yVal))) %>%
            ggplot(aes_string(x="date")) + 
            geom_line(data=~filter(., name %in% setdiff(namesPlot, namesSec)), 
                      aes(y=get(yVal), group=name, color=mapper[name])
                      ) + 
            scale_x_date(date_breaks="1 month", date_labels="%b") + 
            geom_hline(aes(yintercept=0), lty=2) + 
            labs(x="") + 
            theme(axis.text.x = element_text(angle = 90))
        if (!is.null(namesSec)) {
            # Secondary series are divided by scaleSec for plotting; sec_axis
            # multiplies back so the secondary axis shows original units
            p1 <- p1 + 
                geom_line(data=~filter(., name %in% namesSec), 
                          aes(y=get(yVal)/scaleSec, color=mapper[name], group=name)
                          ) + 
                scale_y_continuous(name=primYLab, 
                                   sec.axis=sec_axis(~.*scaleSec, name=secYLab)
                                   )
        } else {
            p1 <- p1 + scale_y_continuous(name=primYLab)
        }
        if (length(keyStates) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFixed) "fixed" else "free_y")
        if (!is.null(plotTitle)) p1 <- p1 + labs(title=plotTitle)
        if (!is.null(plotSub)) p1 <- p1 + labs(subtitle=plotSub)
        if (!is.null(plotCaption)) p1 <- p1 + labs(caption=plotCaption)
        p1 <- p1 + scale_color_discrete("Source and metric")
        print(p1)
    }
    
    if (returnData) return(df)
    
}

# Example of combining states: southern New England (MA/CT/RI) and northern
# New England (NH/VT/ME) are each merged into a single pseudo-state; cases go
# on the primary y-axis with deaths on an auto-scaled secondary y-axis
ne_casedeath <- plotStateMetric(stateData, 
                                yVal="vpm7", 
                                namesPlot=c("CTP_cases"),
                                namesSec=c("CTP_deaths"), 
                                keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                                combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                             "NH"="N NE", "VT"="N NE", "ME"="N NE"
                                             ),
                                plotTitle="2020 coronavirus burden per million per day (select states)", 
                                plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                primYLab="Cases per million (7-day rolling mean)",
                                secYLab="Deaths per million (7-day rolling mean)",
                                facetFixed=TRUE, 
                                popData=usafPrepped$usafDemo,
                                returnData=TRUE
                                )
## 
## Will scale by: 0.07901078

# Inspect the combined-state data frame returned by the call above
ne_casedeath
## # A tibble: 2,076 x 5
##    state date       name        vpm7     pop
##    <chr> <date>     <chr>      <dbl>   <dbl>
##  1 N NE  2020-03-06 CTP_cases  0.228  626042
##  2 N NE  2020-03-06 CTP_deaths 0      626042
##  3 N NE  2020-03-07 CTP_cases  0.219 1956650
##  4 N NE  2020-03-07 CTP_deaths 0     1956650
##  5 N NE  2020-03-08 CTP_cases  0.292 1956650
##  6 N NE  2020-03-08 CTP_deaths 0     1956650
##  7 N NE  2020-03-09 CTP_cases  0.438 1956650
##  8 N NE  2020-03-09 CTP_deaths 0     1956650
##  9 N NE  2020-03-10 CTP_cases  0.522 3285978
## 10 N NE  2020-03-10 CTP_deaths 0     3285978
## # ... with 2,066 more rows

An attempt is made to align the curves for two different metrics in a single locale:

alignCurves <- function(df, 
                        valueMetric, 
                        depName,
                        indepName=setdiff(unique(df$name), depName),
                        lagsTry=0:30, 
                        yLabel="Deaths per million", 
                        depLabel="cases",
                        textMetric=stringr::str_split(yLabel, pattern=" ")[[1]][1] %>% stringr::str_to_lower()
                        ) {
    
    # Align the curves for two metrics (e.g. cases and deaths) within each state
    # by finding the single common lag of the predictor that maximizes R-squared
    # in a no-intercept linear model, with a separate multiplier fit per state
    
    # FUNCTION ARGUMENTS
    # df: a data frame containing state-date-name-valueMetric, with only 2 value types in 'name'
    # valueMetric: the name of the value metric
    # depName: the name of the dependent variable (the other will be the predictor)
    # indepName: the name of the predictor variable
    # lagsTry: the lagged values to attempt
    # yLabel: label for the y-axis
    # depLabel: label for the title (regression x-variable name)
    # textMetric: label for the title (regression y-variable name)
    
    # RETURNS: a list of bestLag (scalar lag), bestCoef (per-state multipliers),
    #          bestDF (lagged data with predictions), and lmResults (all models)
    
    # Check that there are only two values in column 'name'
    if (length(unique(df$name))!=2) { stop("\nFunction depends on 'name' having only two possible values\n") }
    
    # Arrange the data by state and date
    df <- df %>%
        arrange(state, date)
    
    # Function to make a data frame with a specific lag
    # (dependent series kept as-is; predictor lagged within each state, then joined)
    helperMakeLagData <- function(df, depName, indepName, valueMetric, lagValue) {
        depData <- df %>%
            filter(name==depName) %>%
            select_at(vars(all_of(c("state", "date", valueMetric)))) %>%
            purrr::set_names(c("state", "date", "depVar"))
        indepData <- df %>%
            filter(name==indepName) %>%
            group_by(state) %>%
            mutate(indepVar=lag(get(valueMetric), lagValue)) %>%
            ungroup() %>%
            select(state, date, indepVar)
        fullData <- depData %>%
            full_join(indepData, by=c("state", "date"))
        fullData
    }
    
    # Run a simple linear model for depName ~ lag(otherName, lagsTry) to assess performance
    # (no intercept so a zero predictor implies a zero response)
    lmResults <- lapply(lagsTry, 
                        function(lagValue) {
                            lm(depVar ~ indepVar:state + 0, 
                               data=helperMakeLagData(df, 
                                                      depName=depName, 
                                                      indepName=indepName, 
                                                      valueMetric=valueMetric, 
                                                      lagValue=lagValue
                                                      )
                               )
                        }
                        )
    
    # Find the best lag and coefficients
    # (vapply guarantees a numeric vector even for a single model)
    dfResults <- tibble::tibble(lags=lagsTry, 
                                rsq=vapply(lmResults, FUN=function(x) summary(x)$r.squared, FUN.VALUE=numeric(1))
                                )
    p1 <- dfResults %>%
        ggplot(aes(x=lags, y=rsq)) + 
        geom_point() + 
        labs(x="Lag", y="R-squared", title="R-squared vs. lag for aligning curves")
    print(p1)
    
    # Calculate the best lag and coefficients
    # If several lags tie on R-squared, take the smallest so bestLag is always
    # a scalar (a vector here would break the model lookup and plot title below)
    bestLag <- dfResults %>%
        filter(rsq==max(rsq, na.rm=TRUE)) %>%
        pull(lags) %>%
        min()
    bestCoef <- coef(lmResults[[which(lagsTry==bestLag)]]) %>%
        as.data.frame() %>% 
        purrr::set_names("mult") %>%
        tibble::rownames_to_column("state") %>%
        mutate(state=str_replace(state, "indepVar:state", ""))
    
    # Plot the curves using the coefficients and lags
    bestDF <- helperMakeLagData(df, 
                                depName=depName, 
                                indepName=indepName, 
                                valueMetric=valueMetric, 
                                lagValue=bestLag
                                ) %>%
        filter(!is.na(indepVar)) %>%
        left_join(bestCoef, by="state") %>%
        mutate(pred=mult*indepVar)
    p2 <- bestDF %>%
        select(state, date, depVar, pred, mult) %>%
        pivot_longer(-c(state, date, mult)) %>%
        mutate(name=case_when(name=="depVar" ~ "Actual value", 
                              name=="pred" ~ "Predicted value\n(lag, mult)", 
                              TRUE ~ "Unknown Element"
                              )
               ) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=name, color=name)) + 
        geom_text(data=~filter(., date==max(date)), 
                  aes(x=date, y=+Inf, label=paste0("Multiplier: ", round(mult, 3))), 
                  hjust=1, 
                  vjust=1
                  ) +
        labs(x="", 
             y=yLabel, 
             title=paste0("Predicting ", 
                          textMetric, 
                          " based on lagged ", 
                          depLabel, 
                          " (best lag: ", 
                          bestLag, 
                          " days)"
                          ),
             subtitle="Best lag is based on highest correlation/R-squared, common across all facets"
             ) + 
        facet_wrap(~state) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        theme(axis.text.x = element_text(angle = 90)) + 
        scale_color_discrete("Metric")
    print(p2)
    
    # Return the key data
    list(bestLag=bestLag, bestCoef=bestCoef, bestDF=bestDF, lmResults=lmResults)
    
}



createAndAlignCurves <- function(df, 
                                 yVal, 
                                 namesPlot, 
                                 keyStates,
                                 lagValueMetric,
                                 lagDepName,
                                 namesSec=NULL,
                                 scaleSec=NULL,
                                 plotTitle=NULL,
                                 plotSub=NULL,
                                 plotCaption=NULL,
                                 primYLab=NULL,
                                 secYLab="Caution, different metric and scale",
                                 facetFixed=TRUE,
                                 mapper=varMapper, 
                                 combStates=vector("character", 0), 
                                 popData=NULL, 
                                 yValPerCap = (yVal %in% c("vpm", "vpm7")), 
                                 printPlot=TRUE, 
                                 ...
                                 ) {
    
    # Convenience wrapper: build (and optionally plot) the state metric frame
    # via plotStateMetric(), then feed the returned data into alignCurves()
    # to identify the best lag and per-state multipliers
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # lagValueMetric: the metric to be used for checking lags (typically 'vpm7')
    # lagDepName: dependent variable (records in column 'name') to be used for the lagging process
    # namesSec: names to be plotted on a secondary y-axes
    # scaleSec: scale to be used for the secondary axis 
    #           namesSec/scaleSec should be similar in magnitude to namesPlot
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # primYLab: primary y label (NULL means use mapper)
    # secYLab: secondary y label (default is "Caution, different metric and scale")
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    # mapper: mapping file for variable names to descriptive names
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # popData: a population file for combining states
    # yValPerCap: boolean, is the y-value of type per-capita?
    # printPlot: boolean, whether to print the plots
    # ...: other arguments to be passed to alignCurves() (NOT to plotStateMetric())
    
    # Assemble the plotStateMetric() arguments explicitly; returnData is forced
    # to TRUE because the resulting frame is required by the lagging step
    metricArgs <- list(df, 
                       yVal=yVal, 
                       namesPlot=namesPlot,
                       keyStates=keyStates,
                       namesSec=namesSec, 
                       scaleSec=scaleSec,
                       plotTitle=plotTitle, 
                       plotSub=plotSub, 
                       plotCaption=plotCaption,
                       primYLab=primYLab,
                       secYLab=secYLab,
                       facetFixed=facetFixed,
                       mapper=mapper,
                       combStates=combStates,
                       popData=popData,
                       yValPerCap=yValPerCap,
                       printPlot=printPlot,
                       returnData=TRUE
                       )
    metricFrame <- do.call(plotStateMetric, metricArgs)
    
    # Run the lagging process on the generated frame; extra arguments in ...
    # flow only to alignCurves()
    lagFit <- alignCurves(metricFrame, valueMetric=lagValueMetric, depName=lagDepName, ...)
    
    # Return the key values
    list(dfList=metricFrame, lmList=lagFit)
    
}


# Example for northeastern states
# Small states are grouped into regions via combStates to reduce noise;
# the lagging step predicts CTP_deaths from lagged CTP_cases over lags 0-30
neCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME", "DE", "DC"),
                                    combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                                 "NH"="N NE", "VT"="N NE", "ME"="N NE", 
                                                 "NY"="NY/NJ", "NJ"="NY/NJ", 
                                                 "DE"="DE/DC", "DC"="DE/DC"
                                                 ),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## 
## Will scale by: 0.07975869

# Example for midwestern states
# No state combinations here; each of OH/MI/IN/IL is analyzed on its own facet
mwCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("OH", "MI", "IN", "IL"),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## 
## Will scale by: 0.02321167

## Warning: Removed 3 row(s) containing missing values (geom_path).

The midwest is challenging to align. If using a single value for lag and a single value for CFR (case fatality rate), then predictions will have far too few deaths in the early months and far too many deaths in the later months.

The changes in CFR over time can also be estimated:

# Updated for automatic lag time assessment
assessStateCFR <- function(lst, 
                           keyStates, 
                           depVar, 
                           indepVar,
                           depTitleName, 
                           indepTitleName,
                           keyMetric="vpm7", 
                           lagEarlyDate=as.Date("2020-03-31"), 
                           lagMidDate=NULL,
                           lagLateDate=as.Date("2020-10-15"), 
                           lagEarlyValue=10, 
                           lagLateValue=20, 
                           lagsTry=0:30
                           ) {
    
    # FUNCTION ARGUMENTS:
    # lst: A list such as produced by createAndAlignCurves()
    # keyStates: The key states to be extracted from the list
    # depVar: the dependent variable
    # indepVar: the independent variable
    # depTitleName: the name for the dependent variable in the title
    # indepTitleName: the name for the independent variable in the plot title
    # keyMetric: the name of the key metric that is being assessed
    # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
    # lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
    #             NULL means midway between lagEarlyDate and lagLateDate
    # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
    # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
    #                NULL means calculate from data and may differ by state
    # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
    #               NULL means estimate from data and may differ by state
    # lagsTry: the values for lag to be attempted if lageEarlyValue and/or lagLateValue is NULL
    
    # Extract the data for keyStates
    df <- lst[["dfList"]] %>%
        filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
        pivot_wider(names_from="name", values_from=keyMetric)

    # Function for finding lag time correlations
    helperLagCor <- function(lt, lf, dp, id) {
        lf %>%
            group_by(state) %>%
            mutate(y=get(dp), x=lag(get(id), lt)) %>%
            summarize(p=cor(x, y, use="complete.obs"), .groups="drop_last") %>%
            ungroup() %>%
            mutate(lag=lt)
    }
    
    # Middle date for splitting data
    if (is.null(lagMidDate)) lagMidDate <- mean(c(lagEarlyDate, lagLateDate))
    
    # Get the early lags from the data
    eLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date<=lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, earlyLag=lag)
    
    # Get the late lags from the data
    lLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date>lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, lateLag=lag)
    
    # Create the full lag frame, including substituting the fixed value(s) if provided
    lagFrame <- eLag %>%
        inner_join(lLag, by="state")
    if (!is.null(lagEarlyValue)) lagFrame <- lagFrame %>% mutate(earlyLag=lagEarlyValue)
    if (!is.null(lagLateValue)) lagFrame <- lagFrame %>% mutate(lateLag=lagLateValue)
    print(lagFrame)
    
    # Apply the assumed lagging information
    fullTime <- as.integer(lagLateDate-lagEarlyDate)
    df <- df %>%
        left_join(lagFrame, by="state") %>%
        arrange(state, date) %>%
        group_by(state) %>%
        mutate(eLag=lag(get(indepVar), mean(earlyLag)), 
               lLag=lag(get(indepVar), mean(lateLag)), 
               pctEarly=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1), 
               x=ifelse(is.na(eLag), NA, pctEarly*eLag + (1-pctEarly)*ifelse(is.na(lLag), 0, lLag)), 
               y=get(depVar),
               mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
               ) %>%
        filter(!is.na(x)) %>%
        ungroup()

    # Regression for data from keyStates
    if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df, na.action=na.exclude) 
    else stateLM <- lm(y ~ x:mon + 0, data=df, na.action=na.exclude)
    
    # Add the predicted value to df
    df <- df %>%
        mutate(pred=predict(stateLM))

    # Plot of curve overlaps
    p1 <- df %>%
        select(state, date, y, pred) %>%
        pivot_longer(-c(state, date)) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        labs(x="", 
             y=stringr::str_to_title(depTitleName), 
             title=paste0("Predicted vs. actual ", depTitleName)
             ) +
        scale_color_discrete("Metric") +
        facet_wrap(~state)
    print(p1)
    
    # Plot of rate by month
    p2 <- coef(stateLM) %>%
        as.data.frame() %>%
        purrr::set_names("CFR") %>%
        tibble::rownames_to_column("monState") %>%
        mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""), 
                          levels=month.abb
                          ), 
               state=if (length(keyStates)==1) keyStates 
                     else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement="")
               ) %>%
        left_join(lagFrame, by="state") %>%
        ggplot(aes(x=mon, y=CFR)) + 
        geom_col(fill="lightblue") + 
        geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) +
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=-Inf, y=Inf, label=paste0("Early Lag: ", earlyLag)), 
                  hjust=0, 
                  vjust=1
                  ) + 
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=Inf, y=Inf, label=paste0("Late Lag: ", lateLag)), 
                  hjust=1, 
                  vjust=1
                  ) + 
        labs(x="", 
             y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName), 
             title=paste0(stringr::str_to_title(depTitleName), 
                          " vs. lagged ", 
                          indepTitleName, 
                          " in state(s): ", 
                          paste0(keyStates, collapse=", ")
                          ), 
             subtitle=paste0("Assumed early lag on ", 
                             lagEarlyDate,
                             " interpolated to late lag on ", 
                             lagLateDate
                             ), 
             caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
             ) + 
        facet_wrap(~state)
    print(p2)

    # Return the data frame
    df
    
}

# Deaths vs. cases in Michigan, Illinois, and Indiana
# lagEarlyValue/lagLateValue of NULL means lags are estimated from the data
# and may differ by state (the estimated lag frame is printed below)
mwOut <- assessStateCFR(mwCurveList, 
                        keyStates=c("MI", "IL", "IN"), 
                        depVar="CTP_deaths", 
                        indepVar="CTP_cases", 
                        depTitleName="deaths", 
                        indepTitleName="cases", 
                        lagEarlyValue=NULL,
                        lagLateValue=NULL
                        )
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(keyMetric)` instead of `keyMetric` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 3 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IN           2       2
## 2 IL           5       3
## 3 MI          10       0

# Deaths vs. cases in NY/NJ and S NE
# (these are the combined-state groupings created for neCurveList above;
# lags are again estimated from the data rather than fixed)
neOut <- assessStateCFR(neCurveList, 
                        keyStates=c("NY/NJ", "S NE"), 
                        depVar="CTP_deaths", 
                        indepVar="CTP_cases", 
                        depTitleName="deaths", 
                        indepTitleName="cases", 
                        lagEarlyValue=NULL,
                        lagLateValue=NULL
                        )
## # A tibble: 2 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 NY/NJ        6      24
## 2 S NE         6       0

The CFR declines in more recent months, possibly as a function of a greater number of tests finding less serious disease. Lag times are variable but typically seem to be around a week. The use of both lag times and variable CFR by month introduces some risk of over-fitting.

The process for investigating lags and leads can also be refreshed with new data:

lagVectorWindows <- function(v1, 
                             v2, 
                             lagsTry=0:30, 
                             windowSize=30, 
                             minNoNA=ceiling(windowSize/2), 
                             updateStatus=FALSE, 
                             isLag=TRUE
                             ) {
    
    # Compute the correlation of v1 against lagged/led versions of v2 within
    # sliding windows, returning one row per (window start, lag) combination
    
    # FUNCTION ARGUMENTS:
    # v1: the first vector, which will be used 'as is'
    # v2: the second vector, which will be lagged/led by various values for lagsTry
    # lagsTry: the values for x that will be used in cor(v1, lag/lead(v2, x))
    # windowSize: the size of the window to use in taking snippets of v1 and lagged/led v2
    # minNoNA: minimum number of non-NA lagged/led values needed to calculate a correlation
    # updateStatus: whether to print which window is being worked on
    # isLag: boolean, should a lag or a lead be applied (TRUE is lag, FALSE is lead)
    
    # Validate the inputs: vectors must align and at least one window must fit
    if (length(v1) != length(v2)) stop("\nv1 and v2 must be the same length\n")
    if (length(v1) < windowSize) stop("\nwindowSize is larger than the vectors\n")
    
    # Find the function to be used
    func <- if (isLag) lag else lead
    
    # Find the list of all possible window start points
    # (seq_len() avoids the descending c(1, 0) sequence that 1:n gives at n=0)
    windowStarts <- seq_len(length(v1)-windowSize+1)
    
    # Helper function to create a frame of correlations
    helperCorr <- function(s) {
        
        # Create the end point for the vector
        e <- s + windowSize - 1
        
        # Announce the start
        if (updateStatus) cat("\nProcessing window starting at:", s)
        
        # Create the correlations tibble for all values of lag, and return
        # (vapply pins the output types; windows with too few non-NA values get NA)
        tibble::tibble(startWindow=s, endWindow=e, lags=lagsTry) %>%
            mutate(na1=sum(is.na(v1[s:e])), 
                   na2=vapply(lags, FUN=function(x) sum(is.na(func(v2, x)[s:e])), FUN.VALUE=integer(1)), 
                   p=vapply(lags, 
                            FUN=function(x) { 
                                if (sum(!is.na(func(v2, x)[s:e])) < minNoNA) {
                                    NA_real_
                                } else {
                                    cor(v1[s:e], func(v2, x)[s:e], use="complete.obs")
                                }
                                }, 
                            FUN.VALUE=numeric(1)
                            )
                   )
    }

    # Bind the correlations frames and return
    map_dfr(windowStarts, .f=helperCorr)
    
}



# Function to assess correlations by lag/lead and window by state
stateCorr <- function(lst, 
                      keyState, 
                      met="vpm7", 
                      v1Name="CTP_deaths", 
                      v2Name="CTP_cases", 
                      windowSize=42, 
                      isLag=TRUE
                      ) {
    
    # Assess windowed correlations between two metrics for a single state,
    # plotting the correlation distribution by lag/lead, the best lag/lead by
    # window start date, and a tile map of correlations; returns the raw frame
    
    # FUNCTION ARGUMENTS:
    # lst: the processed list
    # keyState: the state of interest
    # met: the metric of interest
    # v1Name: the name of the first vector (this is considered fixed)
    # v2Name: the name of the second vector (this will have the lead/lag applied to it)
    # windowSize: number of days in the window
    # isLag: boolean, whether to use lag (TRUE) or lead (FALSE) on v2Name
    
    # Extract the data for the key State
    df <- lst[["dfList"]] %>%
        filter(state %in% keyState, !is.na(get(met))) %>%
        arrange(state, date, name)
    
    # Get the minimum date that is common to both
    minDate <- df %>%
        group_by(name) %>%
        summarize(date=min(date), .groups="drop_last") %>%
        pull(date) %>%
        max()
    
    # Extract v1 and v2
    v1 <- df %>% 
        filter(name==v1Name, date>=minDate) %>% 
        pull(met)
    v2 <- df %>% 
        filter(name==v2Name, date>=minDate) %>% 
        pull(met)

    # Confirm that dates are the same for both vectors
    # (all.equal() returns TRUE or a character description of the differences,
    # so the result must be wrapped in isTRUE() before negating; negating a
    # character string would error rather than reach the stop() message)
    dfDates1 <- df %>% filter(name==v1Name, date>=minDate) %>% pull(date)
    dfDates2 <- df %>% filter(name==v2Name, date>=minDate) %>% pull(date)
    if (!isTRUE(all.equal(dfDates1, dfDates2))) stop("\nDate mismatch\n")

    # Find the lags in the data
    dfLags <- lagVectorWindows(v1, v2, lagsTry=0:30, windowSize=windowSize, isLag=isLag) %>%
        mutate(windowStartDate=dfDates1[startWindow])

    # Give the description of the lag or lead
    descr <- ifelse(isLag, "lag (days)", "lead (days)")
    
    # Boxplot of correlations by lag
    p1 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=factor(lags), y=p)) + 
        geom_boxplot(fill="lightblue") + 
        labs(x=stringr::str_to_title(descr), 
             y="Correlation", 
             title=paste0("Box plot of correlation by ", descr)
             )
    print(p1)

    # Plot of best lags by starting date
    p2 <- dfLags %>%
        filter(!is.na(p)) %>%
        group_by(startWindow) %>%
        filter(p==max(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_point(aes(size=p)) + 
        labs(x="Window start date", 
             y=paste0("Best ", descr), 
             title=paste0("Best ", descr, " by window starting date")
             ) + 
        scale_size_continuous(paste0("p at best ", stringr::str_replace(descr, " .*", ""))) + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p2)
    
    # Plot of correlations by lag
    # (the tiles map p to fill, so the fill scale - not the color scale -
    # must carry the legend title)
    p3 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_tile(aes(fill=p)) + 
        labs(x="Window start date", 
             y=stringr::str_to_title(descr), 
             title=paste0(stringr::str_to_title(descr), " by window starting date")
             ) + 
        scale_fill_continuous(paste0("p at ", stringr::str_replace(descr, " .*", ""))) + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p3)

    # Rename variable lags to leads if isLag is FALSE
    if (isFALSE(isLag)) dfLags <- dfLags %>%
        rename(leads=lags)
    
    # Return dfLags
    dfLags
    
}


# Lead analysis: isLag=FALSE applies a lead to CTP_deaths (v2) relative to
# CTP_cases (v1), assessing how far deaths trail the case curve in each window
miLeadData <- stateCorr(mwCurveList, keyState="MI", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)

nynjLeadData <- stateCorr(neCurveList, keyState="NY/NJ", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)

And the full process can be run for the state of Texas:

# Example for southern states
# (same configuration as the midwestern example, applied to AZ/FL/GA/TX)
soCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("AZ", "FL", "GA", "TX"),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## 
## Will scale by: 0.02108297

## Warning: Removed 3 row(s) containing missing values (geom_path).

# CFR assessment for the southern states using the default fixed lags
# (early lag 10 days at 2020-03-31, interpolated to late lag 20 days at 2020-10-15)
assessStateCFR(soCurveList, 
               keyStates=c("AZ", "FL", "GA", "TX"), 
               depVar="CTP_deaths", 
               indepVar="CTP_cases", 
               depTitleName="deaths", 
               indepTitleName="cases"
               )
## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <dbl>   <dbl>
## 1 GA          10      20
## 2 TX          10      20
## 3 AZ          10      20
## 4 FL          10      20

## Warning: Removed 4 rows containing missing values (position_stack).
## Warning: Removed 4 rows containing missing values (geom_text).

## # A tibble: 967 x 13
##    state date       CTP_cases CTP_deaths earlyLag lateLag   eLag  lLag pctEarly
##    <chr> <date>         <dbl>      <dbl>    <dbl>   <dbl>  <dbl> <dbl>    <dbl>
##  1 AZ    2020-03-17      1.17     0            10      20 0.0837    NA        1
##  2 AZ    2020-03-18      1.92     0.0209       10      20 0.146     NA        1
##  3 AZ    2020-03-19      2.93     0.0418       10      20 0.146     NA        1
##  4 AZ    2020-03-20      5.17     0.0418       10      20 0.126     NA        1
##  5 AZ    2020-03-21      7.05     0.105        10      20 0.146     NA        1
##  6 AZ    2020-03-22      8.83     0.126        10      20 0.146     NA        1
##  7 AZ    2020-03-23     11.2      0.167        10      20 0.272     NA        1
##  8 AZ    2020-03-24     14.0      0.272        10      20 0.293     NA        1
##  9 AZ    2020-03-25     16.1      0.293        10      20 0.398     NA        1
## 10 AZ    2020-03-26     16.0      0.314        10      20 0.732     NA        1
## # ... with 957 more rows, and 4 more variables: x <dbl>, y <dbl>, mon <fct>,
## #   pred <dbl>
# Lead analysis for Texas: deaths led relative to cases, as for MI and NY/NJ
txLeadData <- stateCorr(soCurveList, keyState="TX", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)

The process for assessing integrated data is converted to a main function:

integrateStateData <- function(stateData=NULL, 
                               popData=NULL,
                               ctpList=NULL,
                               usafData=NULL,
                               cdcList=NULL, 
                               glimpseStateData=NULL, 
                               runAll=FALSE,
                               runTwoAxis=runAll, 
                               yVal="vpm7", 
                               var1="CTP_cases", 
                               var2="CTP_deaths", 
                               keyStates=sort(c(state.abb, "DC")), 
                               combStates=vector("character", 0), 
                               mapper=varMapper, 
                               runCreateAlign=runAll, 
                               lagsTry=0:30, 
                               runCFR=runAll, 
                               lagEarlyValue=10, 
                               lagLateValue=20, 
                               lagEarlyDate=as.Date("2020-03-31"), 
                               lagMidDate=NULL,
                               lagLateDate=as.Date("2020-10-15")
                               ) {
    
    # FUNCTION ARGUMENTS:
    # stateData: an integrated state-level data file (NULL means create from components)
    # popData: a state-level population data file (must be passed if stateData is not NULL)
    # ctpList: a processed list of COVID Tracking Project data (must be provided if stateData=NULL)
    # usafData: a processed tibble of USA Facts data (must be provided if stateData=NULL)
    # cdcList: a processed list of CDC All-Cause deaths data (must be provided if stateData=NULL)
    # glimpseStateData: boolean, whether to glimpse the stateData file (NULL means only if from components)
    # runAll: boolean, whether to set al of runTwoAxis, runCreateAlign, and runCFR all to the same value
    # runTwoAxis: whether to show a plot of two metrics on two axes
    # yVal: the y-value to use from stateData
    # var1: the first of the two variables of interest (should be the leading component if a drives b)
    # var2: the second of the two variables of interest (should be the lagging component if a drives b)
    # keyStates: subset of states to use for analysis
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # mapper: mapping file for variables to descriptive names
    # runCreateAlign: boolean, should createAndAlignCurves() be run?
    # lagsTry: lag values to try (applies to both createAlignCurves and assessStateCFR)
    # runCFR: boolean, should assessStateCFR be run?
    # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
    #                NULL means calculate from data and may differ by state
    # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
    #               NULL means estimate from data and may differ by state
    # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
    # lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
    #             NULL means midway between lagEarlyDate and lagLateDate
    # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
    
    # Check that either stateData or its components have been provided
    if (!is.null(stateData)) {
        cat("\nA file has been passed for stateData, components will be ignored\n")
        if (is.null(popData)) stop("Must also pass a popData file of state-level population\n")
        if (is.null(glimpseStateData)) glimpseStateData <- FALSE
    } else {
        if (is.null(ctpList) | is.null(usafData) | is.null(cdcList)) {
            stop("\nMust provided all of ctpList, usafData, cdcList when stateData is NULL\n")
        }
        cat("\nBuilding stateData from the passed components\n")
        if (is.null(glimpseStateData)) glimpseStateData <- TRUE
        
        # Create COVID Tracking Project File
        ctpPrepped <- prepCTPData(ctpList)
        
        # Create USA Facts file
        usafPrepped <- prepUSAFData(usafData)
        
        # Create an integrated state population demographics file
        demoData <- ctpPrepped$ctpDemo %>%
            rename(popCTP=pop) %>%
            full_join(rename(usafPrepped$usafDemo, popUSAF=pop), by="state") %>%
            mutate(pop=pmax(popCTP, popUSAF))
        
        # Create CDC All-Cause File
        cdcPrepped <- prepCDCData(cdcList, popData=demoData)

        # Integrated state data
        stateData <- ctpPrepped$ctpData %>%
            bind_rows(usafPrepped$usafData) %>%
            bind_rows(cdcPrepped$cdcData)
        
        # Create popData if not provided
        if (is.null(popData)) popData <- demoData %>% select(state, pop)
        
    }
    
    # Show summaries of stateData if requested
    if (glimpseStateData) {
        # Glimpse the file
        glimpse(stateData)
        # Show control totals
        stateData %>%
            group_by(name) %>%
            summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE), .groups="drop_last") %>%
            print()
    }
    
    # Run plotStateMetric if requested
    if (runTwoAxis) {
        caseDeath <- plotStateMetric(stateData, 
                                     yVal=yVal, 
                                     namesPlot=var1,
                                     namesSec=var2, 
                                     keyStates=keyStates,
                                     combStates=combStates,
                                     plotTitle="2020 coronavirus burden per million per day (select states)", 
                                     plotSub="Caution that metrics are on different axes and scales",
                                     primYLab=paste0(mapper[var1], "\n", mapper[yVal]), 
                                     secYLab=paste0(mapper[var2], "\n", mapper[yVal]), 
                                     facetFixed=TRUE, 
                                     popData=popData,
                                     returnData=TRUE
                                     )
    } else {
        caseDeath <- NULL
    }
 
    # Run createAndAlignCurves() if requested
    if (runCreateAlign) {
        curveList <- createAndAlignCurves(stateData, 
                                          yVal=yVal, 
                                          namesPlot=var1,
                                          namesSec=var2, 
                                          keyStates=keyStates,
                                          combStates=combStates,
                                          plotTitle="2020 coronavirus burden per million per day (select states)", 
                                          plotSub="Caution that metrics are on different axes and scales",
                                          primYLab=paste0(mapper[var1], "\n", mapper[yVal]), 
                                          secYLab=paste0(mapper[var2], "\n", mapper[yVal]), 
                                          facetFixed=TRUE, 
                                          popData=popData,
                                          printPlot=TRUE,
                                          lagValueMetric=yVal, 
                                          lagDepName=var2, 
                                          lagsTry=lagsTry
                                          )
        
    } else {
        curveList <- NULL
    }
    
    # Run assessStateCFR() if requested (requires that curveList have been created)
    if (runCFR & is.null(curveList)) {
        cat("\nassessStateCFR requires runCreateAlign=TRUE; skipping and setting runCFR=FALSE\n")
        runCFR <- FALSE
    }
    if (runCFR) {
        # Create the list of key states based on keyStates and combStates
        useStates <- keyStates
        for (ctr in 1:length(useStates)) { 
            if (useStates[ctr] %in% names(combStates)) useStates[ctr] <- combStates[useStates[ctr]]
        }
        useStates <- unique(useStates)
        # Run assessStateCFR
        cfrList <- assessStateCFR(curveList, 
                                  keyStates=useStates, 
                                  depVar=var2, 
                                  indepVar=var1, 
                                  depTitleName=stringr::str_replace(var2, ".*_", ""), 
                                  indepTitleName=stringr::str_replace(var1, ".*_", ""), 
                                  keyMetric=yVal,
                                  lagEarlyDate=lagEarlyDate,
                                  lagMidDate=lagMidDate,
                                  lagLateDate=lagLateDate,
                                  lagEarlyValue=lagEarlyValue,
                                  lagLateValue=lagLateValue,
                                  lagsTry=lagsTry
                                  )
    } else {
        cfrList <- NULL
    }
    
    # Return a list of the key data
    list(stateData=stateData, popData=popData, caseDeath=caseDeath, curveList=curveList, cfrList=cfrList)
    
}


# Example for creating a full stateData file
# stateData is NULL here, so integrateStateData() builds it from the three source
# components (CTP list, USAF state data, CDC list) read from saved RDS files.
# runAll defaults to FALSE, so only stateData and popData are populated in the result.
fullStateList <- integrateStateData(ctpList=readFromRDS("test_old_201108"), 
                                    usafData=readFromRDS("cty_old_20201109")$clusterStateData, 
                                    cdcList=readFromRDS("cdcList_20201110")
                                    )
## 
## Building stateData from the passed components
## Rows: 104,103
## Columns: 9
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name   <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 6, 2, 8, 0, 14, 6,...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.1428571, 0.1428571, 0.1428571, 0.14...
## $ vpm    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7   <dbl> NA, NA, NA, 0.0000000, 0.1934601, 0.1934601, 0.1934601, 0.19...
## # A tibble: 8 x 3
##   name             value     value7
##   <chr>            <dbl>      <dbl>
## 1 CDC_deaths    2224832.   2173735.
## 2 CDC_excess     242466.    240886.
## 3 CTP_cases     9717214    9372412.
## 4 CTP_deaths     228262     224869.
## 5 CTP_hosp      9301372    9036822.
## 6 CTP_tests   154892254  150735326.
## 7 USAF_cases    9679664    9336904.
## 8 USAF_deaths    233286     230182.
fullStateList
## $stateData
## # A tibble: 104,103 x 9
##    state date       metric source name      value value7   vpm   vpm7
##    <chr> <date>     <chr>  <chr>  <chr>     <dbl>  <dbl> <dbl>  <dbl>
##  1 AK    2020-03-06 cases  CTP    CTP_cases     0 NA      0    NA    
##  2 AK    2020-03-07 cases  CTP    CTP_cases     0 NA      0    NA    
##  3 AK    2020-03-08 cases  CTP    CTP_cases     0 NA      0    NA    
##  4 AK    2020-03-09 cases  CTP    CTP_cases     0  0      0     0    
##  5 AK    2020-03-10 cases  CTP    CTP_cases     0  0.143  0     0.193
##  6 AK    2020-03-11 cases  CTP    CTP_cases     0  0.143  0     0.193
##  7 AK    2020-03-12 cases  CTP    CTP_cases     0  0.143  0     0.193
##  8 AK    2020-03-13 cases  CTP    CTP_cases     1  0.143  1.35  0.193
##  9 AK    2020-03-14 cases  CTP    CTP_cases     0  0.429  0     0.580
## 10 AK    2020-03-15 cases  CTP    CTP_cases     0  0.857  0     1.16 
## # ... with 104,093 more rows
## 
## $popData
## # A tibble: 51 x 2
##    state      pop
##    <chr>    <dbl>
##  1 AK      738432
##  2 AL     4858979
##  3 AR     2978204
##  4 AZ     6828065
##  5 CA    39144818
##  6 CO     5456574
##  7 CT     3590886
##  8 DC      672228
##  9 DE      945934
## 10 FL    20271272
## # ... with 41 more rows
## 
## $caseDeath
## NULL
## 
## $curveList
## NULL
## 
## $cfrList
## NULL
# Example for using the full stateData file: NY / NJ / New England analysis
# runAll=TRUE turns on the two-axis plot, curve alignment, and CFR assessment.
# combStates pools southern New England (MA/CT/RI) as "S NE" and northern New
# England (NH/VT/ME) as "N NE" for plotting.
# lagEarlyValue/lagLateValue of NULL means lags are estimated from the data
# and may differ by state (see the earlyLag/lateLag table in the output).
nenynjList <- integrateStateData(stateData=fullStateList$stateData, 
                                 popData=fullStateList$popData,
                                 runAll=TRUE,
                                 keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                                 combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                              "NH"="N NE", "VT"="N NE", "ME"="N NE"
                                              ), 
                                 lagEarlyValue=NULL,
                                 lagLateValue=NULL
                                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.07901078

## 
## Will scale by: 0.07901078

## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 N NE         3      27
## 2 NY           6      17
## 3 S NE         6       0
## 4 NJ           8      30

## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (geom_text).

# Example for alignment between USA Facts and CTP
# var1/var2 compare the same underlying metric (deaths) as reported by two
# different sources, so estimated lags near zero are expected; any residual
# differences reflect reporting-date conventions rather than epidemiology.
testList <- integrateStateData(stateData=fullStateList$stateData, 
                               popData=fullStateList$popData,
                               runAll=TRUE,
                               keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                               combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                            "NH"="N NE", "VT"="N NE", "ME"="N NE"
                                            ), 
                               var1="CTP_deaths", 
                               var2="USAF_deaths",
                               lagEarlyValue=NULL,
                               lagLateValue=NULL
                               )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 1.281759

## 
## Will scale by: 1.281759

## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 N NE         0       0
## 2 NJ           0       0
## 3 S NE         1       2
## 4 NY           2       4
## Warning: Removed 8 row(s) containing missing values (geom_path).

## Warning: Removed 8 rows containing missing values (position_stack).

## Warning: Removed 8 rows containing missing values (geom_text).

The differences in data reporting between CTP and USA Facts stand out. Lags are generally zero, though the spikes mean that each source has a different cumulative number of deaths. The process can similarly be run for the Great Lakes states (excluding Pennsylvania and New York):

# Example for using the full stateData file: Great Lakes states
# (excludes Pennsylvania and New York); the default var1/var2 compare
# CTP cases (leading) against CTP deaths (lagging), with per-state lags
# estimated from the data (lagEarlyValue/lagLateValue are NULL).
glakesList <- integrateStateData(stateData=fullStateList$stateData, 
                                 popData=fullStateList$popData,
                                 runAll=TRUE,
                                 keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                 lagEarlyValue=NULL,
                                 lagLateValue=NULL
                                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.01614313

## 
## Will scale by: 0.01614313

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IN           2       2
## 2 MN           3      13
## 3 WI           3      30
## 4 IL           5       3
## 5 MI          10       0
## 6 OH          11      19

Creating separate lags by state as well as separate CFR by month is important in the Great Lakes. Michigan had a very large spike early when there was little case-death lag and a high CFR. By contrast, Wisconsin had a large late spike when there was more meaningful case-death lag and a much lower CFR. Indiana especially needs the variation by month as its early spike had ~6% CFR and its late spike had ~1% CFR.

Functions have been updated so that .groups=“drop_last” is set in every summarize() call, avoiding the note about dplyr’s default grouping behavior. Further, lm() has been updated to use na.action=na.exclude so that predict() returns NA for all rows with missing data (required for hospital data, which has a variable date of entry by state).

Suppose that the goal is to check how hospitalizations and deaths correlate:

# Example for using the full stateData file: do hospitalizations lead deaths?
# var1=CTP_hosp is treated as the leading variable and var2=CTP_deaths as the
# lagging variable for the Great Lakes states; lags are estimated per state.
glakesHospList <- integrateStateData(stateData=fullStateList$stateData, 
                                     popData=fullStateList$popData,
                                     runAll=TRUE,
                                     keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                     var1="CTP_hosp",
                                     var2="CTP_deaths",
                                     lagEarlyValue=NULL,
                                     lagLateValue=NULL
                                     )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04114457

## 
## Will scale by: 0.04114457

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 MN           0      11
## 2 MI           3       5
## 3 WI           3      15
## 4 IN           5       3
## 5 OH           9      24
## 6 IL          10       0

## Warning: Removed 7 rows containing missing values (position_stack).
## Warning: Removed 7 rows containing missing values (geom_text).

Even with total hospitalized being a state variable rather than a flow variable, it appears to be a better predictor of changes in deaths over the next week or so. Specifically, even when using a common 7-day lag across all states and dates, application of a single multiplier in the 2%-3% range by state appears to drive reasonably good convergence of the curves. It is a much more stable predictor over time than cases.

Suppose that the goal is to assess whether increases in CDC all-cause deaths link to increases in coronavirus deaths:

# Example for using the full stateData file: do CDC excess all-cause deaths
# track reported coronavirus deaths? Here CDC_excess is the leading variable
# (var1) and CTP_deaths the lagging variable (var2); see the reversed pairing
# in the next example.
glakesCDC1List <- integrateStateData(stateData=fullStateList$stateData, 
                                     popData=fullStateList$popData,
                                     runAll=TRUE,
                                     keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                     var1="CDC_excess",
                                     var2="CTP_deaths",
                                     lagEarlyValue=NULL,
                                     lagLateValue=NULL
                                     )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.7824127

## 
## Will scale by: 0.7824127

## Warning: Removed 59 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 MN           0       0
## 2 OH           1       2
## 3 MI           3       7
## 4 IN           5      18
## 5 WI           7       4
## 6 IL           8       0
## Warning: Removed 110 row(s) containing missing values (geom_path).

# Example for using the full stateData file: the reverse pairing of the
# previous example — CTP_deaths leading (var1) and CDC_excess lagging (var2) —
# to check whether reported coronavirus deaths predict excess all-cause deaths.
glakesCDC2List <- integrateStateData(stateData=fullStateList$stateData, 
                                     popData=fullStateList$popData,
                                     runAll=TRUE,
                                     keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                     var1="CTP_deaths",
                                     var2="CDC_excess",
                                     lagEarlyValue=NULL,
                                     lagLateValue=NULL
                                     )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 1.278098

## 
## Will scale by: 1.278098

## Warning: Removed 63 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IL           0       0
## 2 IN           0      26
## 3 MI           0      12
## 4 MN           0       4
## 5 OH           0      23
## 6 WI           0       0
## Warning: Removed 126 row(s) containing missing values (geom_path).

The link between CDC excess all-cause deaths and reported coronavirus deaths is less clear. There appear to be roughly 30% more excess all-cause deaths than reported coronavirus deaths, with the excess all-cause deaths sometimes leading the reported coronavirus deaths (possibly due to differences in how the date of death is tracked). Further, Wisconsin did not have excess deaths until Fall, so the predictive value of this metric is poor in Wisconsin.

Do increases in coronavirus cases predict increases in CDC all-cause deaths?

# Example for using the full stateData file: do increases in coronavirus cases
# (var1=CTP_cases, leading) predict increases in CDC excess all-cause deaths
# (var2=CDC_excess, lagging) for the Great Lakes states?
glakesCDC3List <- integrateStateData(stateData=fullStateList$stateData, 
                                     popData=fullStateList$popData,
                                     runAll=TRUE,
                                     keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                     var1="CTP_cases",
                                     var2="CDC_excess",
                                     lagEarlyValue=NULL,
                                     lagLateValue=NULL
                                     )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.0206325

## 
## Will scale by: 0.0206325

## Warning: Removed 66 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IL           0       9
## 2 IN           0      13
## 3 MN           0      15
## 4 OH           0      29
## 5 WI           0       5
## 6 MI           7      16
## Warning: Removed 126 row(s) containing missing values (geom_path).

Since excess deaths and coronavirus deaths are reasonably correlated, increases in cases help predict increases in excess all-cause deaths. The rate of excess death per case decreases significantly with time.

Full Update (mid-November)

The full process is updated with an additional two weeks of data:

# Use existing segments with updated data
# Download the raw CTP CSV only if no local copy exists yet (downloadTo=NULL
# skips the download when the file is already on disk); always read from the
# local CSV, and compare against the late-October reference run to surface
# restated historical values and newly added dates.
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201120.csv"
test_old_201120 <- readRunCOVIDTrackingProject(thruLabel="Nov 19, 2020", 
                                               downloadTo=if (file.exists(locDownload)) NULL else locDownload,
                                               readFrom=locDownload, 
                                               compareFile=readFromRDS("test_hier5_201025")$dfRaw,
                                               useClusters=readFromRDS("test_hier5_201025")$useClusters
                                               )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         11556034        243675              10193684
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##           date                  name newValue oldValue
## 1   2020-03-06      positiveIncrease      116      109
## 2   2020-03-07      positiveIncrease      165      176
## 3   2020-03-10      positiveIncrease      407      387
## 4   2020-03-11      positiveIncrease      502      509
## 5   2020-03-12      positiveIncrease      686      671
## 6   2020-03-13      positiveIncrease     1055     1072
## 7   2020-03-15      positiveIncrease     1263     1291
## 8   2020-03-16      positiveIncrease     1776     1739
## 9   2020-03-18      positiveIncrease     3037     3089
## 10  2020-03-19      positiveIncrease     4706     4651
## 11  2020-03-21 hospitalizedCurrently     1492     1436
## 12  2020-03-23 hospitalizedCurrently     2812     2770
## 13  2020-03-25 hospitalizedCurrently     5140     5062
## 14  2020-03-28      positiveIncrease    19601    19925
## 15  2020-03-28         deathIncrease      551      544
## 16  2020-03-29         deathIncrease      504      515
## 17  2020-03-30      positiveIncrease    21485    22042
## 18  2020-03-31      positiveIncrease    25174    24853
## 19  2020-03-31         deathIncrease      907      890
## 20  2020-04-01      positiveIncrease    26128    25791
## 21  2020-04-04      positiveIncrease    32867    33212
## 22  2020-04-06      positiveIncrease    28410    29002
## 23  2020-04-09      positiveIncrease    35116    34503
## 24  2020-04-10      positiveIncrease    33473    34380
## 25  2020-04-10         deathIncrease     2072     2108
## 26  2020-04-11      positiveIncrease    31092    30501
## 27  2020-04-11         deathIncrease     2079     2054
## 28  2020-04-13      positiveIncrease    24384    25195
## 29  2020-04-14      positiveIncrease    26080    25719
## 30  2020-04-15      positiveIncrease    29859    30307
## 31  2020-04-16      positiveIncrease    31581    30978
## 32  2020-04-23         deathIncrease     1814     1791
## 33  2020-04-24         deathIncrease     1972     1895
## 34  2020-04-25         deathIncrease     1627     1748
## 35  2020-04-27         deathIncrease     1287     1270
## 36  2020-04-29         deathIncrease     2685     2713
## 37  2020-05-01         deathIncrease     1808     1779
## 38  2020-05-02         deathIncrease     1531     1562
## 39  2020-05-05         deathIncrease     2496     2452
## 40  2020-05-06         deathIncrease     1915     1948
## 41  2020-05-08         deathIncrease     1780     1798
## 42  2020-05-12      positiveIncrease    22559    22890
## 43  2020-05-12         deathIncrease     1505     1486
## 44  2020-05-13      positiveIncrease    21627    21285
## 45  2020-05-13         deathIncrease     1736     1704
## 46  2020-05-14         deathIncrease     1854     1879
## 47  2020-05-15      positiveIncrease    25422    24685
## 48  2020-05-15         deathIncrease     1265     1507
## 49  2020-05-16      positiveIncrease    23593    24702
## 50  2020-05-16         deathIncrease     1195      987
## 51  2020-05-18         deathIncrease      890      848
## 52  2020-05-21         deathIncrease     1426     1394
## 53  2020-05-22      positiveIncrease    24173    24433
## 54  2020-05-22         deathIncrease     1303     1341
## 55  2020-05-23      positiveIncrease    22365    21531
## 56  2020-05-23         deathIncrease     1035     1063
## 57  2020-05-24      positiveIncrease    18859    20072
## 58  2020-05-25         deathIncrease      553      559
## 59  2020-05-26         deathIncrease      673      645
## 60  2020-05-28         deathIncrease     1245     1231
## 61  2020-05-29         deathIncrease     1167     1184
## 62  2020-05-30      positiveIncrease    23437    23682
## 63  2020-05-30         deathIncrease      917      932
## 64  2020-06-02         deathIncrease     1000      962
## 65  2020-06-03      positiveIncrease    20155    20390
## 66  2020-06-03         deathIncrease      979      993
## 67  2020-06-04      positiveIncrease    20383    20886
## 68  2020-06-04         deathIncrease      868      893
## 69  2020-06-05      positiveIncrease    23065    23394
## 70  2020-06-05         deathIncrease      840      826
## 71  2020-06-06      positiveIncrease    22560    23064
## 72  2020-06-06         deathIncrease      710      728
## 73  2020-06-08         deathIncrease      679      661
## 74  2020-06-12      positiveIncrease    23095    23597
## 75  2020-06-12         deathIncrease      763      775
## 76  2020-06-15         deathIncrease      407      381
## 77  2020-06-16         deathIncrease      707      730
## 78  2020-06-17         deathIncrease      794      767
## 79  2020-06-18      positiveIncrease    27088    27746
## 80  2020-06-18         deathIncrease      690      705
## 81  2020-06-19      positiveIncrease    30960    31471
## 82  2020-06-20      positiveIncrease    31950    32294
## 83  2020-06-20         deathIncrease      611      629
## 84  2020-06-21      positiveIncrease    28848    27928
## 85  2020-06-22         deathIncrease      295      286
## 86  2020-06-23      positiveIncrease    33884    33447
## 87  2020-06-23         deathIncrease      725      710
## 88  2020-06-24         deathIncrease      706      724
## 89  2020-06-25         deathIncrease      664      647
## 90  2020-06-26         deathIncrease      625      637
## 91  2020-06-27         deathIncrease      502      511
## 92  2020-06-29         deathIncrease      358      332
## 93  2020-06-30         deathIncrease      585      596
## 94  2020-07-01         deathIncrease      688      701
## 95  2020-07-02      positiveIncrease    53508    54085
## 96  2020-07-04         deathIncrease      300      306
## 97  2020-07-06      positiveIncrease    41494    41959
## 98  2020-07-06         deathIncrease      266      243
## 99  2020-07-07         deathIncrease      904      923
## 100 2020-07-09         deathIncrease      900      867
## 101 2020-07-10         deathIncrease      822      854
## 102 2020-07-17         deathIncrease      935      951
## 103 2020-07-20         deathIncrease      368      363
## 104 2020-07-21         deathIncrease     1070     1039
## 105 2020-07-22         deathIncrease     1136     1171
## 106 2020-07-24         deathIncrease     1190     1176
## 107 2020-07-25         deathIncrease     1008     1023
## 108 2020-07-26      positiveIncrease    60123    61000
## 109 2020-08-01      positiveIncrease    60247    61101
## 110 2020-08-02         deathIncrease      492      498
## 111 2020-08-03         deathIncrease      536      519
## 112 2020-08-04         deathIncrease     1238     1255
## 113 2020-08-08      positiveIncrease    53084    53712
## 114 2020-08-10         deathIncrease      437      426
## 115 2020-08-14      positiveIncrease    57093    55636
## 116 2020-08-17      positiveIncrease    37411    37880
## 117 2020-08-17         deathIncrease      418      407
## 118 2020-08-21         deathIncrease     1108     1123
## 119 2020-08-22      positiveIncrease    45723    46236
## 120 2020-08-24      positiveIncrease    34250    34643
## 121 2020-08-24         deathIncrease      352      343
## 122 2020-08-29      positiveIncrease    43967    44501
## 123 2020-08-31         deathIncrease      380      366
## 124 2020-09-02      positiveIncrease    30217    30603
## 125 2020-09-07      positiveIncrease    28143    28682
## 126 2020-09-09         deathIncrease     1102     1084
## 127 2020-09-15      positiveIncrease    34904    35445
## 128 2020-09-15         deathIncrease     1044     1031
## 129 2020-09-16         deathIncrease     1184     1200
## 130 2020-09-19      positiveIncrease    44905    45564
## 131 2020-09-20      positiveIncrease    35503    36295
## 132 2020-09-22         deathIncrease      866      854
## 133 2020-09-23         deathIncrease     1147     1159
## 134 2020-09-27      positiveIncrease    34987    35454
## 135 2020-09-27         deathIncrease      302      307
## 136 2020-09-28      positiveIncrease    35883    36524
## 137 2020-09-28         deathIncrease      265      257
## 138 2020-09-29      positiveIncrease    36441    36947
## 139 2020-10-03         deathIncrease      733      741
## 140 2020-10-04      positiveIncrease    37988    38439
## 141 2020-10-05         deathIncrease      341      326
## 142 2020-10-11      positiveIncrease    46269    46946
## 143 2020-10-12      positiveIncrease    42643    43124
## 144 2020-10-13         deathIncrease      700      690
## 145 2020-10-14      positiveIncrease    56117    56797
## 146 2020-10-15         deathIncrease      937      951
## 147 2020-10-16         deathIncrease      891      877
## 148 2020-10-17      positiveIncrease    57330    57943
## 149 2020-10-17         deathIncrease      762      780
## 150 2020-10-18      positiveIncrease    48284    48922
## 151 2020-10-19         deathIncrease      461      456
## 152 2020-10-21      positiveIncrease    60953    58606
## 153 2020-10-22      positiveIncrease    72842    75248
## Joining, by = c("date", "name")
## Warning: Removed 26 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##   state                  name newValue oldValue
## 1    AK      positiveIncrease    12523    13535
## 2    CO      positiveIncrease    93398    91570
## 3    FL      positiveIncrease   766305   776249
## 4    NM      positiveIncrease    41040    40168
## 5    NM hospitalizedCurrently    27399    27120
## 6    PR      positiveIncrease    31067    61275
## 7    RI      positiveIncrease    30581    30116
## Rows: 14,673
## Columns: 55
## $ date                        <date> 2020-11-19, 2020-11-19, 2020-11-19, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 24909, 225910, 139855, 0, 287225, 10592...
## $ probableCases               <dbl> NA, 36449, 15690, NA, 8007, NA, 9148, 6...
## $ negative                    <dbl> 873890, 1307256, 1428285, 1988, 1758070...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults            <dbl> 898799, 1496717, 1552450, 1988, 2037288...
## $ hospitalizedCurrently       <dbl> 139, 1315, 891, NA, 1796, 5319, 1593, 8...
## $ hospitalizedCumulative      <dbl> 593, 23295, 8268, NA, 23871, NA, 11980,...
## $ inIcuCurrently              <dbl> NA, NA, 353, NA, 433, 1253, NA, NA, 34,...
## $ inIcuCumulative             <dbl> NA, 2182, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 14, NA, 143, NA, 227, NA, NA, NA, 10, N...
## $ onVentilatorCumulative      <dbl> NA, 1261, 925, NA, NA, NA, NA, NA, NA, ...
## $ recovered                   <dbl> 7165, 90702, 120545, NA, 46951, NA, 103...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt                <chr> "11/19/2020 03:59", "11/19/2020 11:00",...
## $ dateModified                <dttm> 2020-11-19 03:59:00, 2020-11-19 11:00:...
## $ checkTimeEt                 <chr> "11/18 22:59", "11/19 06:00", "11/18 19...
## $ death                       <dbl> 101, 3419, 2297, 0, 6384, 18466, 2350, ...
## $ hospitalized                <dbl> 593, 23295, 8268, NA, 23871, NA, 11980,...
## $ dateChecked                 <dttm> 2020-11-19 03:59:00, 2020-11-19 11:00:...
## $ totalTestsViral             <dbl> 898799, 1496717, 1552450, 1988, NA, 215...
## $ positiveTestsViral          <dbl> 29798, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 868482, NA, 1428285, NA, NA, NA, NA, NA...
## $ positiveCasesViral          <dbl> NA, 189461, 124165, 0, 279218, 1059267,...
## $ deathConfirmed              <dbl> 101, 3123, 2105, NA, 5919, NA, NA, 3862...
## $ deathProbable               <dbl> NA, 296, 192, NA, 465, NA, NA, 943, NA,...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 2695700, NA, 60...
## $ totalTestsPeopleViral       <dbl> NA, NA, NA, NA, 2037288, NA, 1537359, N...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 354092, NA, 194983, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 15141, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 179842, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 68847, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 107231, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 18437, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 26512, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 490, 2424, 2238, 0, 4123, 11478, 6107, ...
## $ negativeIncrease            <dbl> 12751, 13211, 12983, 0, 14417, 122507, ...
## $ total                       <dbl> 898799, 1533166, 1568140, 1988, 2045295...
## $ totalTestResultsIncrease    <dbl> 13241, 15049, 14667, 0, 18293, 133985, ...
## $ posNeg                      <dbl> 898799, 1533166, 1568140, 1988, 2045295...
## $ deathIncrease               <dbl> 1, 72, 22, 0, 19, 106, 26, 21, 2, 0, 81...
## $ hospitalizedIncrease        <dbl> 9, 207, 129, 0, 398, 0, 178, 0, 0, 0, 2...
## $ hash                        <chr> "dd60caa3c0b65aa928204156ef405a95116f3f...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState    cases deaths  hosp     tests     n
##   <lgl>         <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE         52410   1108    NA    474990  1245
## 2 TRUE       11503624 242567    NA 173236764 13428
## Rows: 13,428
## Columns: 6
## $ date   <date> 2020-11-19, 2020-11-19, 2020-11-19, 2020-11-19, 2020-11-19,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 490, 2424, 2238, 4123, 11478, 6107, 2353, 213, 398, 8882, 27...
## $ deaths <dbl> 1, 72, 22, 19, 106, 26, 21, 2, 0, 81, 37, 0, 40, 14, 180, 59...
## $ hosp   <dbl> 139, 1315, 891, 1796, 5319, 1593, 840, 127, 165, 3380, 2142,...
## $ tests  <dbl> 13241, 15049, 14667, 18293, 133985, 55085, 36596, 6422, 1216...
## Rows: 13,428
## Columns: 14
## $ date   <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state  <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-10-21 through current
## 
## Recency is defined as 2020-10-21 through current

## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the CTP results for reuse by later chunks (read back via
# readFromRDS("test_old_201120")). ovrWriteError=FALSE: if the RDS already
# exists, keep it and print a message rather than raising an error.
saveToRDS(test_old_201120, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/test_old_201120.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL
# Locations for the population, case, and death files (USAFacts downloads,
# snapshotted on 2020-11-20)
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201120.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201120.csv"

# Previously saved run (2020-10-26) supplies both the old burden data and the
# existing county cluster assignments; read the RDS once instead of twice
ctyPrev_20201026 <- readFromRDS("cty_20201026")

# Run old segments against new data.
# dlCaseDeath: download only when either local file is missing. Both
# file.exists() calls are scalar here, so use the short-circuit && (which,
# unlike elementwise &, also errors loudly if a path vector slips in).
cty_old_20201120 <- readRunUSAFacts(maxDate="2020-11-18", 
                                    popLoc=popLoc, 
                                    caseLoc=caseLoc, 
                                    deathLoc=deathLoc, 
                                    dlCaseDeath=!(file.exists(caseLoc) && file.exists(deathLoc)),
                                    oldFile=ctyPrev_20201026$dfBurden, 
                                    existingCountyClusters=ctyPrev_20201026$clustVec
                                    )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 964,890
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 964,890
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0      0.00817                0               0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-10-20 through current
## 
## Recency is defined as 2020-10-20 through current
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the USAFacts county results for reuse by later chunks (read back via
# readFromRDS("cty_old_20201120")). ovrWriteError=FALSE: an existing RDS is
# kept with a message rather than raising an error.
saveToRDS(cty_old_20201120, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201120.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

There appears to be a county-level anomaly for FIPS 22053, which shows 18,544 cases reported on 2020-11-18 in a county with a population of only ~30,000 people. The process should be updated to allow exclusion of spurious data points such as this one.

# Use data that have previously been downloaded
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201120.csv"
# Process the CDC all-cause weekly death counts against the prior CTP run.
# - startYear/curYear bound the comparison window (2015 baseline vs 2020)
# - weekThru=38 / startWeek=9: weeks of data used for the current year;
#   cdcPlotStartWeek=10 presumably sets where plots begin — TODO confirm
# - lst: the saved CTP result list; epiMap/agePopData: saved lookup tables
# - cvDeathThru: last date of COVID deaths included ("2020-09-19")
# - dlData: only download when no local copy exists under ./RInputFiles/Coronavirus/
# - stateNoCheck=c("NC"): states exempted from the suppression checks (NC has
#   suppressed weeks — see the "Data suppression checks" output below)
cdcList_20201120 <- readRunCDCAllCause(loc=cdcLoc, 
                                       startYear=2015, 
                                       curYear=2020,
                                       weekThru=38, 
                                       startWeek=9, 
                                       lst=readFromRDS("test_old_201120"), 
                                       epiMap=readFromRDS("epiMonth"), 
                                       agePopData=readFromRDS("usPopBucket2020"), 
                                       cvDeathThru="2020-09-19", 
                                       cdcPlotStartWeek=10, 
                                       dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)), 
                                       stateNoCheck=c("NC")
                                       )
## Rows: 179,663
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        26907           4  3293947
## 2 45-64 years        32860          15 12885563
## 3 65-74 years        32851          16 12792037
## 4 75-84 years        32870          19 15899092
## 5 85 years and older 32856          19 20718264
## 6 Under 25 years     21319           2  1416502
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416391
##  2 2015-2019  2015 Unweighted           15285           0 5416391
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15318           0 5643347
##  6 2015-2019  2017 Unweighted           15318           0 5643347
##  7 2015-2019  2018 Predicted (weighted) 15307           0 5698022
##  8 2015-2019  2018 Unweighted           15307           0 5698022
##  9 2015-2019  2019 Predicted (weighted) 15318           0 5725502
## 10 2015-2019  2019 Unweighted           15318           0 5725502
## 11 2020       2020 Predicted (weighted) 13260          41 5584252
## 12 2020       2020 Unweighted           13217          34 5487101
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153186           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     75          75  0.    
## 3 2020      <NA>                                       26402           0  1.11e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153186           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  21043          34  9.13e6
## 3 2020     Data in recent weeks are incomplete. Only~    444           0  2.04e5
## 4 2020     Data in recent weeks are incomplete. Only~    339          22  4.45e4
## 5 2020     Data in recent weeks are incomplete. Only~   2241          19  7.10e5
## 6 2020     Data in recent weeks are incomplete. Only~     48           0  2.62e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    312           0  1.16e5
## 9 2020     <NA>                                         2002           0  8.22e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3660           0 33382812
## 2     CA           California 3660           0  3175134
## 3     FL              Florida 3660           0  2439960
## 4     TX                Texas 3660           0  2405990
## 5     PA         Pennsylvania 3660           0  1600457
## 6     OH                 Ohio 3660           0  1443190
## 7     IL             Illinois 3660           0  1260836
## 8     NY             New York 3660           0  1191716
## 9     MI             Michigan 3660           0  1148050
## 10    NC       North Carolina 3573          33  1068292
## 11    GA              Georgia 3659           0  1002322
## 12    NJ           New Jersey 3654           0   892684
## 13    TN            Tennessee 3660           0   872960
## 14    VA             Virginia 3660           0   802114
## 15    IN              Indiana 3658           0   776857
## 16    MO             Missouri 3656           0   755398
## 17    AZ              Arizona 3660           0   707602
## 18    MA        Massachusetts 3624           0   705559
## 19    YC        New York City 3656           0   689474
## 20    WA           Washington 3658          10   664276
## 21    AL              Alabama 3658           0   620268
## 22    WI            Wisconsin 3640           0   614607
## 23    MD             Maryland 3654           0   589675
## 24    SC       South Carolina 3658           0   582138
## 25    KY             Kentucky 3621           0   564915
## 26    LA            Louisiana 3655           0   545500
## 27    MN            Minnesota 3612           0   521445
## 28    CO             Colorado 3657           0   462126
## 29    OK             Oklahoma 3649           0   461666
## 30    OR               Oregon 3490           0   427650
## 31    MS          Mississippi 3595           0   376968
## 32    AR             Arkansas 3552           0   375477
## 33    CT          Connecticut 3215          17   367731
## 34    IA                 Iowa 3293           0   352522
## 35    PR          Puerto Rico 3372           0   343055
## 36    KS               Kansas 3348           0   307450
## 37    NV               Nevada 3394           0   299546
## 38    WV        West Virginia 3098          11   258798
## 39    UT                 Utah 3546           0   221678
## 40    NM           New Mexico 3231           0   212208
## 41    NE             Nebraska 2942           0   195453
## 42    ME                Maine 2732           0   165642
## 43    ID                Idaho 2856           0   158780
## 44    NH        New Hampshire 2751           0   139437
## 45    HI               Hawaii 2642           0   128673
## 46    RI         Rhode Island 2553           0   117919
## 47    MT              Montana 2638           0   114177
## 48    DE             Delaware 2645           0   102858
## 49    SD         South Dakota 2522           4    89560
## 50    ND         North Dakota 2512           0    78477
## 51    DC District of Columbia 2630           0    66270
## 52    VT              Vermont 2416           0    64000
## 53    WY              Wyoming 2395           0    49201
## 54    AK               Alaska 2433           0    43852
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,863
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 14 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 North Carol~ 2020-09-05 NC    2020     36 25-4~     NA 2020   Pred~ Suppres~
##  2 North Carol~ 2020-09-05 NC    2020     36 45-6~     NA 2020   Pred~ Suppres~
##  3 North Carol~ 2020-09-12 NC    2020     37 45-6~     NA 2020   Pred~ Suppres~
##  4 North Carol~ 2020-09-19 NC    2020     38 45-6~     NA 2020   Pred~ Suppres~
##  5 North Carol~ 2020-09-05 NC    2020     36 65-7~     NA 2020   Pred~ Suppres~
##  6 North Carol~ 2020-09-12 NC    2020     37 65-7~     NA 2020   Pred~ Suppres~
##  7 North Carol~ 2020-09-19 NC    2020     38 65-7~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-09-05 NC    2020     36 75-8~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-09-12 NC    2020     37 75-8~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-09-19 NC    2020     38 75-8~     NA 2020   Pred~ Suppres~
## 11 North Carol~ 2020-09-05 NC    2020     36 85 y~     NA 2020   Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC    2020     37 85 y~     NA 2020   Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC    2020     38 85 y~     NA 2020   Pred~ Suppres~
## 14 North Carol~ 2020-09-05 NC    2020     36 Unde~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 14 suppressions
##  *** Of these suppressions, 10 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,639
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)

## 
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Cache the processed CDC list to RDS; ovrWriteError=FALSE keeps an existing file (with a message) instead of erroring
saveToRDS(cdcList_20201120, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201120.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL
# Example for creating a full stateData file
# Combine the cached CTP, USAFacts, and CDC objects into a single integrated state-level list
fullStateList_20201120 <- integrateStateData(ctpList=readFromRDS("test_old_201120"), 
                                             usafData=readFromRDS("cty_old_20201120")$clusterStateData, 
                                             cdcList=readFromRDS("cdcList_20201120")
                                             )
## 
## Building stateData from the passed components
## Rows: 109,123
## Columns: 9
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name   <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3, 1, 10, 13, 4, 7...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
## $ vpm    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7   <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
## # A tibble: 8 x 3
##   name             value     value7
##   <chr>            <dbl>      <dbl>
## 1 CDC_deaths    2340646.   2289614.
## 2 CDC_excess     255796.    254403.
## 3 CTP_cases    11503624   11013435.
## 4 CTP_deaths     242567     238041 
## 5 CTP_hosp     10124707    9790131.
## 6 CTP_tests   173236764  168394308.
## 7 USAF_cases   11269520   10812984.
## 8 USAF_deaths    245506     241695.
fullStateList_20201120
## $stateData
## # A tibble: 109,123 x 9
##    state date       metric source name      value value7   vpm   vpm7
##    <chr> <date>     <chr>  <chr>  <chr>     <dbl>  <dbl> <dbl>  <dbl>
##  1 AK    2020-03-06 cases  CTP    CTP_cases     0 NA         0 NA    
##  2 AK    2020-03-07 cases  CTP    CTP_cases     0 NA         0 NA    
##  3 AK    2020-03-08 cases  CTP    CTP_cases     0 NA         0 NA    
##  4 AK    2020-03-09 cases  CTP    CTP_cases     0  0         0  0    
##  5 AK    2020-03-10 cases  CTP    CTP_cases     0  0         0  0    
##  6 AK    2020-03-11 cases  CTP    CTP_cases     0  0         0  0    
##  7 AK    2020-03-12 cases  CTP    CTP_cases     0  0         0  0    
##  8 AK    2020-03-13 cases  CTP    CTP_cases     0  0         0  0    
##  9 AK    2020-03-14 cases  CTP    CTP_cases     0  0.429     0  0.580
## 10 AK    2020-03-15 cases  CTP    CTP_cases     0  1.14      0  1.55 
## # ... with 109,113 more rows
## 
## $popData
## # A tibble: 51 x 2
##    state      pop
##    <chr>    <dbl>
##  1 AK      738432
##  2 AL     4858979
##  3 AR     2978204
##  4 AZ     6828065
##  5 CA    39144818
##  6 CO     5456574
##  7 CT     3590886
##  8 DC      672228
##  9 DE      945934
## 10 FL    20271272
## # ... with 41 more rows
## 
## $caseDeath
## NULL
## 
## $curveList
## NULL
## 
## $cfrList
## NULL
# Example for using the full stateData file
# Great Lakes states; lagEarlyValue/lagLateValue of NULL means the lags are estimated from the data per state
glakesList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                          popData=fullStateList_20201120$popData,
                                          runAll=TRUE,
                                          keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
                                          lagEarlyValue=NULL,
                                          lagLateValue=NULL
                                          )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.01264923

## 
## Will scale by: 0.01264923

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 6 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IN           2       0
## 2 MN           3       0
## 3 WI           3      29
## 4 IL           5       6
## 5 MI          10       2
## 6 OH          11      22

Data can be run on some of the states currently experiencing heavy outbreaks, with NY, MI, AZ as reference states that previously had high spikes:

# Example for using the full stateData file
# Plains states alongside NY, MI, AZ as reference states; ND and SD are combined and plotted as "Dakotas"
plainsList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                          popData=fullStateList_20201120$popData,
                                          runAll=TRUE,
                                          keyStates=c("MN", "ND", "SD", "MT", "NE", 
                                                      "KS", "IA", "MI", "NY", "AZ"
                                                      ),
                                          combStates=c("ND"="Dakotas", "SD"="Dakotas"),
                                          lagEarlyValue=NULL,
                                          lagLateValue=NULL
                                          )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.02194667

## 
## Will scale by: 0.02194667

## # A tibble: 9 x 3
##   state   earlyLag lateLag
##   <chr>      <int>   <int>
## 1 KS             0      23
## 2 NE             0      12
## 3 MN             3       0
## 4 NY             6       2
## 5 IA             8       4
## 6 MI            10       2
## 7 Dakotas       19      20
## 8 AZ            21      19
## 9 MT            22      18

## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 10 rows containing missing values (geom_text).

# Example for using hospitalizations
# Same plains comparison keyed on CTP hospitalizations (var1) instead of the default series
# NOTE: KS is omitted from keyStates here — presumably due to its sparse hospitalization data (see the
#       correlation-crash issue described below); confirm before re-adding
plainsListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                              popData=fullStateList_20201120$popData,
                                              runAll=TRUE,
                                              var1="CTP_hosp",
                                              keyStates=c("MN", "ND", "SD", "MT", "NE", 
                                                          "IA", "MI", "NY", "AZ"
                                                          ),
                                              combStates=c("ND"="Dakotas", "SD"="Dakotas"),
                                              lagEarlyValue=NULL,
                                              lagLateValue=NULL
                                              )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04109452

## 
## Will scale by: 0.04109452

## # A tibble: 8 x 3
##   state   earlyLag lateLag
##   <chr>      <int>   <int>
## 1 MN             0      11
## 2 NY             0       0
## 3 Dakotas        1       6
## 4 IA             1       7
## 5 MI             3       4
## 6 MT             3      10
## 7 AZ            26      12
## 8 NE            27      12

## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing missing values (geom_text).

Two issues should be addressed, both related to the x-variable and its lag structure:

  1. Small data samples lead to unreasonable coefficients for y ~ x regression in early months
  2. Lack of data availability can crash the correlations process since there are no data points to correlate (occurs with hospitals in Kansas)

The assessStateCFR() function is updated to use a ceiling value as the plotted estimate any time an estimate exceeds the ceiling, and to set a lag of 0 if there is insufficient data for running correlations:

# Updated for automatic lag time assessment
assessStateCFR <- function(lst, 
                           keyStates, 
                           depVar, 
                           indepVar,
                           depTitleName, 
                           indepTitleName,
                           keyMetric="vpm7", 
                           lagEarlyDate=as.Date("2020-03-31"), 
                           lagMidDate=NULL,
                           lagLateDate=as.Date("2020-10-15"), 
                           lagEarlyValue=10, 
                           lagLateValue=20, 
                           lagsTry=0:30, 
                           maxCFR=0.2
                           ) {
    
    # PURPOSE: estimate, plot, and return CFR-style ratios (depVar as a fraction of lagged indepVar)
    #          by state and month, with the state-specific lag either supplied or estimated from
    #          lag-correlation search; prints the lag table and two diagnostic plots as side effects
    
    # FUNCTION ARGUMENTS:
    # lst: A list such as produced by createAndAlignCurves()
    # keyStates: The key states to be extracted from the list
    # depVar: the dependent variable
    # indepVar: the independent variable
    # depTitleName: the name for the dependent variable in the title
    # indepTitleName: the name for the independent variable in the plot title
    # keyMetric: the name of the key metric that is being assessed
    # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
    # lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
    #             NULL means midway between lagEarlyDate and lagLateDate
    # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
    # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
    #                NULL means calculate from data and may differ by state
    # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
    #               NULL means estimate from data and may differ by state
    # lagsTry: the values for lag to be attempted if lagEarlyValue and/or lagLateValue is NULL
    # maxCFR: the maximum CFR to use (anything above will be scaled back to this ceiling value)
    #
    # RETURNS: the per-state/date data frame used for modeling (x, y, lags, month, and predictions)
    
    # Extract the data for keyStates: one column per series name, dropping rows missing keyMetric
    df <- lst[["dfList"]] %>%
        select_at(vars(all_of(c("state", "date", "name", keyMetric)))) %>%
        filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
        pivot_wider(names_from="name", values_from=keyMetric)

    # Function for finding lag time correlations
    # lt: the lag (in days) being tried; lf: the data frame; dp/id: dependent/independent column names
    # The right_join back to allStates retains states with no correlatable rows (their p becomes NA),
    # which is what later allows a default lag of 0 when there is insufficient data (e.g. KS hospitalizations)
    helperLagCor <- function(lt, lf, dp, id) {
        allStates <- lf %>%
            count(state) %>%
            select(-n)
        lf %>%
            group_by(state) %>%
            mutate(y=get(dp), x=lag(get(id), lt)) %>%
            filter(!is.na(y), !is.na(x)) %>%
            summarize(p=cor(x, y, use="complete.obs"), .groups="drop_last") %>%
            ungroup() %>%
            right_join(allStates, by="state") %>%
            mutate(lag=ifelse(is.na(lt), 0, lt))
    }
    
    # Middle date for splitting data
    if (is.null(lagMidDate)) lagMidDate <- mean(c(lagEarlyDate, lagLateDate))
    
    # Get the early lags from the data: try every lag in lagsTry and keep the one with the
    # highest correlation (row_number()==1 breaks ties in favor of the first/smallest lag tried)
    eLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date<=lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, earlyLag=lag)
    
    # Get the late lags from the data (same search, using only dates after the midpoint)
    lLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date>lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, lateLag=lag)
    
    # Create the full lag frame, including substituting the fixed value(s) if provided
    # States dropped in the searches above (all-NA correlations) reappear via full_join and default to lag 0
    lagFrame <- eLag %>%
        full_join(lLag, by="state") %>%
        mutate(earlyLag=ifelse(is.na(earlyLag), 0, earlyLag), lateLag=ifelse(is.na(lateLag), 0, lateLag))
    if (!is.null(lagEarlyValue)) lagFrame <- lagFrame %>% mutate(earlyLag=lagEarlyValue)
    if (!is.null(lagLateValue)) lagFrame <- lagFrame %>% mutate(lateLag=lagLateValue)
    print(lagFrame)
    
    # Apply the assumed lagging information
    # x blends the early-lagged and late-lagged series, with the early weight (pctEarly) interpolated
    # linearly from 1 at lagEarlyDate to 0 at lagLateDate and clamped to [0, 1] outside that window
    fullTime <- as.integer(lagLateDate-lagEarlyDate)
    df <- df %>%
        left_join(lagFrame, by="state") %>%
        arrange(state, date) %>%
        group_by(state) %>%
        mutate(eLag=lag(get(indepVar), mean(earlyLag)), 
               lLag=lag(get(indepVar), mean(lateLag)), 
               pctEarly=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1), 
               x=ifelse(is.na(eLag), NA, pctEarly*eLag + (1-pctEarly)*ifelse(is.na(lLag), 0, lLag)), 
               y=get(depVar),
               mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
               ) %>%
        filter(!is.na(x)) %>%
        ungroup()
    
    # Regression for data from keyStates: no-intercept slope model so each month (and month:state)
    # coefficient is directly interpretable as a CFR-style ratio of y to lagged x
    if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df, na.action=na.exclude) 
    else stateLM <- lm(y ~ x:mon + 0, data=df, na.action=na.exclude)
    
    # Add the predicted value to df
    df <- df %>%
        mutate(pred=predict(stateLM))
    
    # Plot of curve overlaps (predicted vs. actual dependent series, faceted by state)
    p1 <- df %>%
        select(state, date, y, pred) %>%
        pivot_longer(-c(state, date)) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        labs(x="", 
             y=stringr::str_to_title(depTitleName), 
             title=paste0("Predicted vs. actual ", depTitleName)
             ) +
        scale_color_discrete("Metric") +
        facet_wrap(~state)
    print(p1)
    
    # Plot of rate by month
    # Coefficient names look like "x:monJan:stateXX" ("x:monJan" for a single state); strip the affixes
    # to recover month and state, then apply the maxCFR ceiling and flag capped values in red
    p2 <- coef(stateLM) %>%
        as.data.frame() %>%
        purrr::set_names("CFR") %>%
        filter(!is.na(CFR)) %>%
        tibble::rownames_to_column("monState") %>%
        mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""), 
                          levels=month.abb
                          ), 
               state=if (length(keyStates)==1) keyStates 
                     else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement=""), 
               colorUse=ifelse(CFR>maxCFR, "red", "lightblue"), 
               CFR=pmin(CFR, maxCFR)
               ) %>%
        left_join(lagFrame, by="state") %>%
        ggplot(aes(x=mon, y=CFR)) + 
        geom_col(aes(fill=colorUse)) + 
        geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) +
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=-Inf, y=Inf, label=paste0("Early Lag: ", earlyLag)), 
                  hjust=0, 
                  vjust=1
                  ) + 
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=Inf, y=Inf, label=paste0("Late Lag: ", lateLag)), 
                  hjust=1, 
                  vjust=1
                  ) + 
        labs(x="", 
             y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName), 
             title=paste0(stringr::str_to_title(depTitleName), 
                          " vs. lagged ", 
                          indepTitleName, 
                          " in state(s): ", 
                          paste0(keyStates, collapse=", ")
                          ), 
             subtitle=paste0("Assumed early lag on ", 
                             lagEarlyDate,
                             " interpolated to late lag on ", 
                             lagLateDate, 
                             "\nValues above ", 
                             maxCFR, 
                             " reported as ", 
                             maxCFR, 
                             " and flagged in red"
                             ), 
             caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
             ) + 
        scale_fill_identity() +
        facet_wrap(~state)
    print(p2)

    # Return the data frame
    df
    
}

The updated function is then run:

# Example for using the full stateData file
# Re-run of the plains comparison under the updated assessStateCFR(); AZ is replaced by FL in keyStates
plainsList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                          popData=fullStateList_20201120$popData,
                                          runAll=TRUE,
                                          keyStates=c("MN", "ND", "SD", "MT", "NE", 
                                                      "KS", "IA", "MI", "NY", "FL"
                                                      ),
                                          combStates=c("ND"="Dakotas", "SD"="Dakotas"),
                                          lagEarlyValue=NULL,
                                          lagLateValue=NULL
                                          )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.02194667

## 
## Will scale by: 0.02194667

## # A tibble: 9 x 3
##   state   earlyLag lateLag
##   <chr>      <int>   <int>
## 1 KS             0      23
## 2 NE             0      12
## 3 MN             3       0
## 4 NY             6       2
## 5 IA             8       4
## 6 MI            10       2
## 7 Dakotas       19      20
## 8 MT            22      18
## 9 FL            23      19

# Example for using hospitalizations
# Re-run keyed on CTP hospitalizations (var1) under the updated assessStateCFR(); KS remains excluded
plainsListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                              popData=fullStateList_20201120$popData,
                                              runAll=TRUE,
                                              var1="CTP_hosp",
                                              keyStates=c("MN", "ND", "SD", "MT", "NE", 
                                                          "IA", "MI", "NY", "FL"
                                                          ),
                                              combStates=c("ND"="Dakotas", "SD"="Dakotas"),
                                              lagEarlyValue=NULL,
                                              lagLateValue=NULL
                                              )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04109452

## 
## Will scale by: 0.04109452

## # A tibble: 8 x 3
##   state   earlyLag lateLag
##   <chr>      <dbl>   <int>
## 1 Dakotas        0       6
## 2 MN             0      11
## 3 NY             0       0
## 4 IA             1       7
## 5 MI             3       4
## 6 MT             3      10
## 7 NE            27      12
## 8 FL             0      13

The issues appear to be addressed, with the function now running as expected.

The process can be run using the state clusters:

# Get the cluster file and build a display label ("Cluster <n>") for each state,
# keeping the state abbreviations as the vector names for use as keyStates/combStates
clustData <- readFromRDS("test_hier5_201025")$useClusters
clustStates <- names(clustData)
clustData <- setNames(paste0("Cluster ", clustData), clustStates)

# Example for using the full stateData file, combining every state into its cluster label
clustList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                         popData=fullStateList_20201120$popData,
                                         runAll=TRUE,
                                         keyStates=clustStates,
                                         combStates=clustData,
                                         lagEarlyValue=NULL,
                                         lagLateValue=NULL
                                         )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04979147

## 
## Will scale by: 0.04979147

## # A tibble: 5 x 3
##   state     earlyLag lateLag
##   <chr>        <int>   <int>
## 1 Cluster 3        2      23
## 2 Cluster 5        4      11
## 3 Cluster 4        6      19
## 4 Cluster 1       12      13
## 5 Cluster 2       23      22

# Example for using hospitalizations
# Cluster-level run keyed on CTP hospitalizations (var1) instead of the default series
clustListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData, 
                                             popData=fullStateList_20201120$popData,
                                             runAll=TRUE,
                                             var1="CTP_hosp",
                                             keyStates=clustStates,
                                             combStates=clustData,
                                             lagEarlyValue=NULL,
                                             lagLateValue=NULL
                                             )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04054712

## 
## Will scale by: 0.04054712

## # A tibble: 5 x 3
##   state     earlyLag lateLag
##   <chr>        <int>   <int>
## 1 Cluster 1        1       0
## 2 Cluster 3        1      12
## 3 Cluster 4        1       6
## 4 Cluster 2        5      14
## 5 Cluster 5        7       2

A function is written to run the component states for a given cluster:

# Function to run the process for all states in specified cluster
regionComponents <- function(lst, 
                             clustVec,
                             clustNum,
                             combStates=vector("character", 0), 
                             lagEarlyValue=NULL,
                             lagLateValue=NULL,
                             returnData=FALSE,
                             ...
                             ) 
    {
    
    # PURPOSE: run integrateStateData() twice (cases vs. deaths, then hospitalizations vs. deaths)
    #          for every state belonging to the requested cluster(s)
    
    # FUNCTION ARGUMENTS:
    # lst: a processed list containing burden data and population data
    # clustVec: a named vector containing the assignment for each state
    # clustNum: cluster number to use OR a specific state to key on (that state's cluster will be used);
    #           may contain multiple numbers/states
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # lagEarlyValue: can force an early lag by using an integer (NULL means estimate from data)
    # lagLateValue: can force a late lag by using an integer (NULL means estimate from data)
    # returnData: whether to return the data files produced
    # ...: other arguments to pass to integrateStateData()
    
    # Get the states to run from clustVec
    # If numeric/integer, pull the member states directly; if character, pull every state sharing a
    # cluster with any of the state name(s) passed
    # (is.numeric() covers both double and integer; inherits()-style class checks are unnecessary)
    if (is.numeric(clustNum)) {
        keyStates <- names(clustVec)[clustVec %in% clustNum]
    } else if (is.character(clustNum)) {
        keyClusters <- unique(clustVec[names(clustVec) %in% clustNum])
        keyStates <- names(clustVec)[clustVec %in% keyClusters]
    } else {
        stop("\nCannot determine the desired states to plot, investigate and re-run\n")
    }
    
    # Guard against an unmatched clustNum, which would otherwise run on zero states downstream
    if (length(keyStates) == 0) {
        stop("\nNo states in clustVec match the requested clustNum, investigate and re-run\n")
    }
    
    # Run for cases vs. deaths
    caseDeathList <- integrateStateData(stateData=lst$stateData, 
                                        popData=lst$popData,
                                        runAll=TRUE,
                                        keyStates=keyStates,
                                        combStates=combStates,
                                        lagEarlyValue=lagEarlyValue,
                                        lagLateValue=lagLateValue, 
                                        ...
                                        )
    
    # Run for hospitalized vs. deaths
    hospDeathList <- integrateStateData(stateData=lst$stateData, 
                                        popData=lst$popData,
                                        runAll=TRUE,
                                        var1="CTP_hosp",
                                        keyStates=keyStates,
                                        combStates=combStates,
                                        lagEarlyValue=lagEarlyValue,
                                        lagLateValue=lagLateValue, 
                                        ...
                                        )
    
    # Return data if requested (otherwise the plots printed by integrateStateData are the only output)
    if (returnData) list(caseDeathList=caseDeathList, hospDeathList=hospDeathList)
    
}

The function is then run for the NY/NJ cluster:

# Run the component states for the NY/NJ cluster (all states sharing a cluster with NY or NJ are included)
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("NY", "NJ")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.07191544

## 
## Will scale by: 0.07191544

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 CT           5      14
## 2 MA           6       1
## 3 NY           6       2
## 4 NJ           8      25

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04109452

## 
## Will scale by: 0.04109452

## Warning: Removed 1 row(s) containing missing values (geom_path).

## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 MA           0       0
## 2 NJ           0      11
## 3 NY           0       0
## 4 CT           3      11

The function is then run for the southern cluster:

# Run the component states for the southern cluster, keyed on FL and AZ
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("FL", "AZ")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.01892897

## 
## Will scale by: 0.01892897

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 7 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 NV           8      26
## 2 AL          10       2
## 3 GA          12      20
## 4 TX          16      15
## 5 AZ          21      19
## 6 FL          23      19
## 7 SC          26      30

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.02398844

## 
## Will scale by: 0.02398844

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 7 x 3
##   state earlyLag lateLag
##   <chr>    <dbl>   <int>
## 1 AL           0       0
## 2 NV           5      15
## 3 TX           7      22
## 4 AZ          26      12
## 5 SC          26       8
## 6 GA          30      19
## 7 FL           0      13

The function is then run for the other early northern cluster:

# Run the component states for the other early northern cluster, keyed on MI
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("MI")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.0187579

## 
## Will scale by: 0.0187579

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 10 x 3
##    state earlyLag lateLag
##    <chr>    <int>   <int>
##  1 MD           0       8
##  2 DE           1      26
##  3 IN           2       0
##  4 DC           4      25
##  5 IL           5       6
##  6 MI          10       2
##  7 LA          11      19
##  8 RI          11       4
##  9 PA          12      17
## 10 MS          15      26

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.02716523

## 
## Will scale by: 0.02716523

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 10 x 3
##    state earlyLag lateLag
##    <chr>    <int>   <int>
##  1 DC           0       8
##  2 DE           0      19
##  3 MD           0       3
##  4 LA           1      11
##  5 RI           1      20
##  6 MI           3       4
##  7 IN           5       6
##  8 IL          10       2
##  9 PA          10       0
## 10 MS          25      10

The function is then run for the states impacted later:

# Run the component states for the later-impacted cluster, keyed on ND (a large, 25-state segment)
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("ND")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.01257989

## 
## Will scale by: 0.01257989

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 25 x 3
##    state earlyLag lateLag
##    <chr>    <int>   <int>
##  1 KS           0      23
##  2 KY           0      20
##  3 NC           0      26
##  4 NE           0      12
##  5 VA           0      16
##  6 AK           3       3
##  7 MN           3       0
##  8 OR           3      14
##  9 WI           3      29
## 10 AR           6       0
## # ... with 15 more rows

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.03541542

## 
## Will scale by: 0.03541542

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 25 x 3
##    state earlyLag lateLag
##    <chr>    <dbl>   <int>
##  1 KY           0      10
##  2 MN           0      11
##  3 NC           0      10
##  4 NM           0       8
##  5 OR           0      11
##  6 VA           0      30
##  7 IA           1       7
##  8 AR           2       0
##  9 WV           2       1
## 10 CA           3      27
## # ... with 15 more rows

This segment should likely be split further, both because of its size and because of meaningful state-to-state differences in the November disease spike.

The function is then run for the handful of outlier states:

# Run the component states for the small outlier cluster, keyed on CO
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("CO")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.008039316

## 
## Will scale by: 0.008039316

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 5 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 CO           0      23
## 2 ME           6       7
## 3 NH          11      27
## 4 VT          11      25
## 5 WA          11      16

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.02771792

## 
## Will scale by: 0.02771792

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 5 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 CO           0      11
## 2 VT           2      13
## 3 ME           4       2
## 4 NH           6      10
## 5 WA          16       4

Data are also produced to show the early outlier region and low outlier region together:

# Run the early outlier and low outlier regions together by keying on CO and NY
# (passing two states pulls the union of both states' clusters)
regionComponents(fullStateList_20201120, 
                 clustVec=readFromRDS("test_hier5_201025")$useClusters, 
                 clustNum=c("CO", "NY")
                 )
## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04185309

## 
## Will scale by: 0.04185309

## Warning: Removed 3 row(s) containing missing values (geom_path).

## # A tibble: 9 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 CO           0      23
## 2 CT           5      14
## 3 MA           6       1
## 4 ME           6       7
## 5 NY           6       2
## 6 NJ           8      25
## 7 NH          11      27
## 8 VT          11      25
## 9 WA          11      16

## 
## A file has been passed for stateData, components will be ignored
## 
## Will scale by: 0.04109452

## 
## Will scale by: 0.04109452

## Warning: Removed 1 row(s) containing missing values (geom_path).

## # A tibble: 9 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 CO           0      11
## 2 MA           0       0
## 3 NJ           0      11
## 4 NY           0       0
## 5 VT           2      13
## 6 CT           3      11
## 7 ME           4       2
## 8 NH           6      10
## 9 WA          16       4

Full Update (new segments, late November)

New data from the COVID Tracking Project are downloaded, and new state-level segments are created using business rules similar to those used previously:

# Use existing segments with updated data
# Local cache of the COVID Tracking Project download; downloadTo is NULL when
# the cache file already exists, so the raw data are fetched at most once and
# then always read back from the same CSV (readFrom)
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201130.csv"
# Re-run the hierarchical state segmentation on data through Nov 29, 2020.
# compareFile passes the prior run's raw data so restatements between the two
# vintages are reported.  The remaining arguments are the clustering business
# rules: cut at 6 segments (kCut), VT manually re-assigned to ME's segment,
# and minimum shape/volume thresholds -- see the CTP functions file for the
# exact semantics of each rule (e.g. how ratioDeathvsCase weights the curves).
test_hier5_201130 <- readRunCOVIDTrackingProject(thruLabel="Nov 29, 2020", 
                                                 downloadTo=if(file.exists(locDownload)) NULL else locDownload,
                                                 readFrom=locDownload, 
                                                 compareFile=readFromRDS("test_hier5_201025")$dfRaw,
                                                 hierarchical=TRUE, 
                                                 reAssignState=list("VT"="ME"), 
                                                 kCut=6, 
                                                 minShape=3, 
                                                 ratioDeathvsCase = 5, 
                                                 ratioTotalvsShape = 0.25, 
                                                 minDeath=100, 
                                                 minCase=10000
                                                 )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         13188675        257920              11071946
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##           date                  name newValue oldValue
## 1   2020-03-05      positiveIncrease       86      103
## 2   2020-03-06      positiveIncrease      127      109
## 3   2020-03-07      positiveIncrease      133      176
## 4   2020-03-09      positiveIncrease      285      292
## 5   2020-03-10      positiveIncrease      439      387
## 6   2020-03-11      positiveIncrease      503      509
## 7   2020-03-12      positiveIncrease      742      671
## 8   2020-03-13      positiveIncrease      952     1072
## 9   2020-03-14      positiveIncrease      982      924
## 10  2020-03-15      positiveIncrease     1189     1291
## 11  2020-03-16      positiveIncrease     1849     1739
## 12  2020-03-17      positiveIncrease     2246     2588
## 13  2020-03-18      positiveIncrease     3364     3089
## 14  2020-03-19      positiveIncrease     4705     4651
## 15  2020-03-21 hospitalizedCurrently     1492     1436
## 16  2020-03-23 hospitalizedCurrently     2812     2770
## 17  2020-03-25 hospitalizedCurrently     5140     5062
## 18  2020-03-28      positiveIncrease    19599    19925
## 19  2020-03-28         deathIncrease      551      544
## 20  2020-03-29         deathIncrease      504      515
## 21  2020-03-30      positiveIncrease    21485    22042
## 22  2020-03-31      positiveIncrease    25174    24853
## 23  2020-03-31         deathIncrease      907      890
## 24  2020-04-01      positiveIncrease    26128    25791
## 25  2020-04-04      positiveIncrease    32867    33212
## 26  2020-04-06      positiveIncrease    28410    29002
## 27  2020-04-09      positiveIncrease    35116    34503
## 28  2020-04-10      positiveIncrease    33473    34380
## 29  2020-04-10         deathIncrease     2072     2108
## 30  2020-04-11      positiveIncrease    31092    30501
## 31  2020-04-11         deathIncrease     2079     2054
## 32  2020-04-13      positiveIncrease    24384    25195
## 33  2020-04-14      positiveIncrease    26080    25719
## 34  2020-04-15      positiveIncrease    29859    30307
## 35  2020-04-16      positiveIncrease    31581    30978
## 36  2020-04-23         deathIncrease     1814     1791
## 37  2020-04-24         deathIncrease     1972     1895
## 38  2020-04-25         deathIncrease     1627     1748
## 39  2020-04-27         deathIncrease     1287     1270
## 40  2020-04-29         deathIncrease     2685     2713
## 41  2020-05-01         deathIncrease     1808     1779
## 42  2020-05-02         deathIncrease     1531     1562
## 43  2020-05-05         deathIncrease     2496     2452
## 44  2020-05-06         deathIncrease     1915     1948
## 45  2020-05-07      positiveIncrease    27224    27537
## 46  2020-05-08         deathIncrease     1780     1798
## 47  2020-05-12      positiveIncrease    22559    22890
## 48  2020-05-12         deathIncrease     1505     1486
## 49  2020-05-13      positiveIncrease    21627    21285
## 50  2020-05-13         deathIncrease     1736     1704
## 51  2020-05-14         deathIncrease     1854     1879
## 52  2020-05-15      positiveIncrease    25422    24685
## 53  2020-05-15         deathIncrease     1265     1507
## 54  2020-05-16      positiveIncrease    23593    24702
## 55  2020-05-16         deathIncrease     1195      987
## 56  2020-05-18         deathIncrease      890      848
## 57  2020-05-21         deathIncrease     1426     1394
## 58  2020-05-22      positiveIncrease    24173    24433
## 59  2020-05-22         deathIncrease     1303     1341
## 60  2020-05-23      positiveIncrease    22365    21531
## 61  2020-05-23         deathIncrease     1035     1063
## 62  2020-05-24      positiveIncrease    18859    20072
## 63  2020-05-25         deathIncrease      553      559
## 64  2020-05-26         deathIncrease      673      645
## 65  2020-05-28         deathIncrease     1245     1231
## 66  2020-05-29         deathIncrease     1167     1184
## 67  2020-05-30      positiveIncrease    23437    23682
## 68  2020-05-30         deathIncrease      917      932
## 69  2020-06-02         deathIncrease     1000      962
## 70  2020-06-03      positiveIncrease    20155    20390
## 71  2020-06-03         deathIncrease      979      993
## 72  2020-06-04      positiveIncrease    20383    20886
## 73  2020-06-04         deathIncrease      868      893
## 74  2020-06-05      positiveIncrease    23065    23394
## 75  2020-06-05         deathIncrease      840      826
## 76  2020-06-06      positiveIncrease    22560    23064
## 77  2020-06-06         deathIncrease      710      728
## 78  2020-06-08         deathIncrease      679      661
## 79  2020-06-12      positiveIncrease    23095    23597
## 80  2020-06-12         deathIncrease      763      775
## 81  2020-06-15         deathIncrease      407      381
## 82  2020-06-16         deathIncrease      707      730
## 83  2020-06-17         deathIncrease      794      767
## 84  2020-06-18      positiveIncrease    27088    27746
## 85  2020-06-18         deathIncrease      690      705
## 86  2020-06-19      positiveIncrease    30960    31471
## 87  2020-06-20      positiveIncrease    31950    32294
## 88  2020-06-20         deathIncrease      611      629
## 89  2020-06-21      positiveIncrease    28848    27928
## 90  2020-06-22         deathIncrease      295      286
## 91  2020-06-23      positiveIncrease    33884    33447
## 92  2020-06-23         deathIncrease      725      710
## 93  2020-06-24         deathIncrease      706      724
## 94  2020-06-25         deathIncrease      664      647
## 95  2020-06-26         deathIncrease      625      637
## 96  2020-06-27         deathIncrease      502      511
## 97  2020-06-29         deathIncrease      358      332
## 98  2020-06-30         deathIncrease      585      596
## 99  2020-07-01         deathIncrease      688      701
## 100 2020-07-02      positiveIncrease    53508    54085
## 101 2020-07-04         deathIncrease      300      306
## 102 2020-07-06      positiveIncrease    41494    41959
## 103 2020-07-06         deathIncrease      266      243
## 104 2020-07-07         deathIncrease      904      923
## 105 2020-07-09         deathIncrease      900      867
## 106 2020-07-10         deathIncrease      822      854
## 107 2020-07-17         deathIncrease      935      951
## 108 2020-07-20         deathIncrease      368      363
## 109 2020-07-21         deathIncrease     1070     1039
## 110 2020-07-22         deathIncrease     1136     1171
## 111 2020-07-24         deathIncrease     1190     1176
## 112 2020-07-25         deathIncrease     1008     1023
## 113 2020-07-26      positiveIncrease    60123    61000
## 114 2020-08-01      positiveIncrease    60247    61101
## 115 2020-08-02         deathIncrease      492      498
## 116 2020-08-03         deathIncrease      536      519
## 117 2020-08-04         deathIncrease     1238     1255
## 118 2020-08-08      positiveIncrease    53084    53712
## 119 2020-08-10         deathIncrease      437      426
## 120 2020-08-14      positiveIncrease    57093    55636
## 121 2020-08-17      positiveIncrease    37411    37880
## 122 2020-08-17         deathIncrease      418      407
## 123 2020-08-21         deathIncrease     1108     1123
## 124 2020-08-22      positiveIncrease    45723    46236
## 125 2020-08-24      positiveIncrease    34250    34643
## 126 2020-08-24         deathIncrease      352      343
## 127 2020-08-29      positiveIncrease    43967    44501
## 128 2020-08-31         deathIncrease      380      366
## 129 2020-09-02      positiveIncrease    30217    30603
## 130 2020-09-07      positiveIncrease    28143    28682
## 131 2020-09-09         deathIncrease     1102     1084
## 132 2020-09-15      positiveIncrease    34904    35445
## 133 2020-09-15         deathIncrease     1044     1031
## 134 2020-09-16         deathIncrease     1184     1200
## 135 2020-09-19      positiveIncrease    44905    45564
## 136 2020-09-20      positiveIncrease    35503    36295
## 137 2020-09-22         deathIncrease      866      854
## 138 2020-09-23         deathIncrease     1147     1159
## 139 2020-09-27      positiveIncrease    34987    35454
## 140 2020-09-27         deathIncrease      302      307
## 141 2020-09-28      positiveIncrease    35883    36524
## 142 2020-09-28         deathIncrease      265      257
## 143 2020-09-29      positiveIncrease    36441    36947
## 144 2020-10-03         deathIncrease      733      741
## 145 2020-10-04      positiveIncrease    37988    38439
## 146 2020-10-05         deathIncrease      341      326
## 147 2020-10-11      positiveIncrease    46269    46946
## 148 2020-10-12      positiveIncrease    42643    43124
## 149 2020-10-13         deathIncrease      700      690
## 150 2020-10-14      positiveIncrease    56117    56797
## 151 2020-10-15         deathIncrease      937      951
## 152 2020-10-16         deathIncrease      891      877
## 153 2020-10-17      positiveIncrease    57330    57943
## 154 2020-10-17         deathIncrease      762      780
## 155 2020-10-18      positiveIncrease    48284    48922
## 156 2020-10-19         deathIncrease      461      456
## 157 2020-10-21      positiveIncrease    60953    58606
## 158 2020-10-22      positiveIncrease    72842    75248
## Joining, by = c("date", "name")
## Warning: Removed 36 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##   state                  name newValue oldValue
## 1    AK      positiveIncrease    12523    13535
## 2    CO      positiveIncrease    93398    91570
## 3    FL      positiveIncrease   766305   776249
## 4    NM      positiveIncrease    41040    40168
## 5    NM hospitalizedCurrently    27399    27120
## 6    PR      positiveIncrease    31067    61275
## 7    RI      positiveIncrease    30581    30116
## Rows: 15,241
## Columns: 55
## $ date                        <date> 2020-11-29, 2020-11-29, 2020-11-29, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 30816, 247229, 156247, 0, 325995, 11989...
## $ probableCases               <dbl> NA, 41286, 19157, NA, 9989, NA, 10126, ...
## $ negative                    <dbl> 975364, 1373770, 1538738, 1988, 1920319...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults            <dbl> 1006180, 1579713, 1675828, 1988, 223632...
## $ hospitalizedCurrently       <dbl> 159, 1609, 1030, NA, 2458, 8198, 1847, ...
## $ hospitalizedCumulative      <dbl> 722, 24670, 8843, NA, 25568, NA, 13428,...
## $ inIcuCurrently              <dbl> NA, NA, 390, NA, 573, 1823, NA, NA, 42,...
## $ inIcuCumulative             <dbl> NA, 2234, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 27, NA, 185, NA, 356, NA, NA, NA, 21, N...
## $ onVentilatorCumulative      <dbl> NA, 1287, 975, NA, NA, NA, NA, NA, NA, ...
## $ recovered                   <dbl> 7165, 161946, 136872, NA, 51911, NA, 12...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt                <chr> "11/29/2020 03:59", "11/29/2020 11:00",...
## $ dateModified                <dttm> 2020-11-29 03:59:00, 2020-11-29 11:00:...
## $ checkTimeEt                 <chr> "11/28 22:59", "11/29 06:00", "11/28 19...
## $ death                       <dbl> 121, 3577, 2470, 0, 6634, 19121, 2521, ...
## $ hospitalized                <dbl> 722, 24670, 8843, NA, 25568, NA, 13428,...
## $ dateChecked                 <dttm> 2020-11-29 03:59:00, 2020-11-29 11:00:...
## $ totalTestsViral             <dbl> 1006180, 1579713, 1675828, 1988, NA, 23...
## $ positiveTestsViral          <dbl> 38165, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 966992, NA, 1538738, NA, NA, NA, NA, NA...
## $ positiveCasesViral          <dbl> NA, 205943, 137090, 0, 316006, 1198934,...
## $ deathConfirmed              <dbl> 121, 3245, 2265, NA, 6148, NA, NA, 3981...
## $ deathProbable               <dbl> NA, 332, 205, NA, 486, NA, NA, 980, NA,...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 3175126, NA, 69...
## $ totalTestsPeopleViral       <dbl> NA, NA, NA, NA, 2236325, NA, 1737952, N...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 363824, NA, 206288, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 16879, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 188172, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 71698, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 135709, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 23208, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 612, 2236, 1221, 0, 3221, 15614, 3489, ...
## $ negativeIncrease            <dbl> 6514, 3978, 9224, 0, 15300, 218705, 904...
## $ total                       <dbl> 1006180, 1620999, 1694985, 1988, 224631...
## $ totalTestResultsIncrease    <dbl> 7126, 5811, 10243, 0, 18441, 234319, 31...
## $ posNeg                      <dbl> 1006180, 1620999, 1694985, 1988, 224631...
## $ deathIncrease               <dbl> 0, 5, 21, 0, 10, 32, 0, 0, 2, 7, 59, 18...
## $ hospitalizedIncrease        <dbl> 1, 0, 24, 0, 220, 0, 59, 0, 0, 0, 116, ...
## $ hash                        <chr> "81a1922227c01f54d1d8cc7e718af55ee8b680...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState    cases deaths  hosp     tests     n
##   <lgl>         <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE         60159   1231    NA    488050  1295
## 2 TRUE       13128516 256689    NA 190653499 13946
## Rows: 13,946
## Columns: 6
## $ date   <date> 2020-11-29, 2020-11-29, 2020-11-29, 2020-11-29, 2020-11-29,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 612, 2236, 1221, 3221, 15614, 3489, 0, 140, 581, 7131, 1665,...
## $ deaths <dbl> 0, 5, 21, 10, 32, 0, 0, 2, 7, 59, 18, 4, 7, 4, 44, 22, 0, 11...
## $ hosp   <dbl> 159, 1609, 1030, 2458, 8198, 1847, 1017, 145, 211, 4059, 249...
## $ tests  <dbl> 7126, 5811, 10243, 18441, 234319, 31899, 0, 5004, 9274, 8509...
## Rows: 13,946
## Columns: 14
## $ date   <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state  <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-10-31 through current
## 
## Recency is defined as 2020-10-31 through current

## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

saveToRDS(test_hier5_201130, ovrWriteError=FALSE)

The state-level segments are compared for general overlap:

# Align old and new segment assignments by state.  Both useClusters vectors
# are named by state; ckState carries the names from the old run (from the
# Nov 8 object, presumably loaded earlier in the document) so the positional
# alignment of the two vectors can be verified below.
stateSegmentChange <- tibble::tibble(state=names(test_hier5_201130$useClusters), 
                                     newSegment=unname(test_hier5_201130$useClusters), 
                                     oldSegment=unname(test_old_201108$useClusters), 
                                     ckState=names(test_old_201108$useClusters)
                                     )

# Sanity check: count positions where the state names from the two runs
# disagree (must be 0 for the positional pairing above to be valid)
stateSegmentChange %>%
    summarize(wrongState=sum(state != ckState))
## # A tibble: 1 x 1
##   wrongState
##        <int>
## 1          0
# Cross-tab heatmap of state counts by (old segment, new segment).
# fct_reorder sorts each axis by the segment's total state count so the
# most-populated segments group together; coord_flip() swaps the display
# axes so the original segment reads along the horizontal direction.
stateSegmentChange %>%
    count(oldSegment, newSegment) %>%
    ggplot(aes(x=fct_reorder(factor(oldSegment), n, .fun=sum), 
               y=fct_reorder(factor(newSegment), n, .fun=sum)
               )
           ) + 
    geom_tile(aes(fill=n)) + 
    geom_text(aes(label=n)) + 
    coord_flip() + 
    scale_fill_continuous("# States", low="white", high="green", limits=c(0, NA)) + 
    labs(title="State segment movement", x="Original Segment", y="New Segment")

With the segments able to incorporate the October-November spike, there is significant splitting of segments based on the degree to which they are impacted by the more recent increases.

Further, plots are made for the disease evolution by state for each of the segment overlaps:

# Build per-state plotting data labeled with the old/new segment pair.
# The bind_rows(mutate(., state="TOTAL")) step duplicates every row under a
# pseudo-state "TOTAL" (magrittr still inserts the LHS as bind_rows' first
# argument because the dot is nested); after grouping, those duplicated rows
# collapse into a population-weighted mean curve per segment pair.
stateSegChangePlotData <- test_hier5_201130$consolidatedPlotData %>%
    ungroup() %>%
    select(state, date, name, pop, vpm7) %>%
    filter(!is.na(vpm7)) %>%
    inner_join(select(stateSegmentChange, state, newSegment, oldSegment), by=c("state")) %>%
    bind_rows(mutate(., state="TOTAL")) %>%
    group_by(state, date, name, newSegment, oldSegment) %>%
    summarize(vpm7=sum(vpm7*pop)/sum(pop), pop=sum(pop), .groups="drop")

# Facet ordering: new segments ascending by row count, old segments
# descending -- presumably so the heaviest overlaps fall near one corner of
# the facet grid; TODO confirm the intended ordering
newSegLevels <- stateSegChangePlotData %>% count(newSegment) %>% arrange(n) %>% pull(newSegment)
oldSegLevels <- stateSegChangePlotData %>% count(oldSegment) %>% arrange(-n) %>% pull(oldSegment)

# One faceted plot per metric: grey lines are individual states, the colored
# line is the population-weighted "TOTAL" curve for each segment pair
for (keyVar in c("deaths", "cases", "hosp")) {
    p1 <- stateSegChangePlotData %>%
        mutate(newSegment=factor(newSegment, levels=newSegLevels), 
               oldSegment=factor(oldSegment, levels=oldSegLevels)
               ) %>%
        filter(name==keyVar) %>%
        ggplot(aes(x=date, y=vpm7)) + 
        geom_line(data=~filter(., state != "TOTAL"), aes(group=state), color="grey") +
        geom_line(data=~filter(., state == "TOTAL"), aes(group=1, color=newSegment)) + 
        facet_grid(oldSegment ~ newSegment) + 
        scale_x_date(date_breaks="1 months", date_labels="%b") + 
        scale_color_discrete("New\nSegment") +
        theme(axis.text.x = element_text(angle = 90)) +
        labs(x="", 
             y="Per million (7-day rolling mean)", 
             title=paste0(stringr::str_to_title(keyVar), " per million per day by segment overlap")
             )
    print(p1)
}

New data from USA Facts are downloaded, and new county-level segments are created using business rules similar to those used previously:

# Locations for the population, case, and death file
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201203.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201203.csv"

# Run old segments against new data
# dlCaseDeath triggers a fresh download only when either cached file is
# missing.  oldFile passes the Oct 26 run's burden data for comparison, and
# existingStateClusters feeds the new state segments computed above so the
# county clustering can reference them.  The remaining arguments are the
# county-level clustering business rules; seed fixes the clustering for
# reproducibility, and orderCluster="dpm" presumably orders the resulting
# clusters by deaths per million -- confirm in the USAF functions file.
cty_new_20201203 <- readRunUSAFacts(maxDate="2020-12-02", 
                                    popLoc=popLoc, 
                                    caseLoc=caseLoc, 
                                    deathLoc=deathLoc, 
                                    dlCaseDeath=!(file.exists(caseLoc) & file.exists(deathLoc)),
                                    oldFile=readFromRDS("cty_20201026")$dfBurden, 
                                    existingStateClusters=test_hier5_201130$useClusters,
                                    createClusters=TRUE,
                                    hierarchical=NA,
                                    minShape=4,
                                    maxShape=11,
                                    ratioDeathvsCase=5,
                                    ratioTotalvsShape=0.25,
                                    minDeath=100,
                                    minCase=5000,
                                    hmlSegs=3,
                                    eslSegs=3,
                                    seed=2012040236, 
                                    orderCluster="dpm"
                                    )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,009,620
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,009,620
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shape curves will impose a floor of at least 5000 cases per million
## Shape curves will impose a floor of at least 100 deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0      0.00377           0.0421         0.00566
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-03 through current
## 
## Recency is defined as 2020-11-03 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

saveToRDS(cty_new_20201203, ovrWriteError=FALSE)

The overlap of county segments is calculated and plotted:

# Prior county-to-segment assignment, one row per county identifier
oldCountySegment <- unique(select(cty_old_20201120$helperACC_county, state, oldSegment=cluster))

# Join old and new assignments.  Note that "state" in these helper tables
# actually carries the county FIPS code (it is renamed to fipsCounty when
# joined downstream); it is zero-padded to 5 characters here so codes match
# regardless of how leading zeros were stored.
countySegmentChange <- cty_new_20201203$helperACC_county %>%
    select(state, newSegment=cluster) %>%
    unique() %>%
    full_join(oldCountySegment, by="state") %>%
    mutate(state=stringr::str_pad(state, width=5, side="left", pad="0"))

# Verify no county is missing from either vintage: count NAs per column
colSums(is.na(countySegmentChange))
##      state newSegment oldSegment 
##          0          0          0
# Cross-tab heatmap of county counts by (old segment, new segment), built
# exactly like the state-level version: both axes ordered by total count,
# then the display axes swapped with coord_flip()
countySegmentChange %>%
    count(oldSegment, newSegment) %>%
    ggplot(aes(x=fct_reorder(factor(oldSegment), n, .fun=sum), 
               y=fct_reorder(factor(newSegment), n, .fun=sum)
               )
           ) + 
    geom_tile(aes(fill=n)) + 
    geom_text(aes(label=n)) + 
    coord_flip() + 
    scale_fill_continuous("# Counties", low="white", high="green", limits=c(0, NA)) + 
    labs(title="County segment movement", x="Original Segment", y="New Segment")

# Test association between old and new assignments; the p-value is simulated
# because the contingency table is too large/sparse for the exact computation
fisher.test(select(countySegmentChange, oldSegment, newSegment) %>% table(), simulate.p.value=TRUE)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  select(countySegmentChange, oldSegment, newSegment) %>% table()
## p-value = 0.0004998
## alternative hypothesis: two.sided
chisq.test(select(countySegmentChange, oldSegment, newSegment) %>% table())
## Warning in chisq.test(select(countySegmentChange, oldSegment, newSegment) %>% :
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  select(countySegmentChange, oldSegment, newSegment) %>% table()
## X-squared = 1707.8, df = 64, p-value < 2.2e-16

There is meaningful overlap between the old and new county segments. Plots of disease burden by county-segment overlap are also created:

# Build county-level plotting data labeled with the old/new segment pair.
# As in the state version, bind_rows(mutate(., ...)) duplicates every row
# under pseudo-identifiers "TOTAL"; after grouping, those rows collapse into
# a population-weighted mean curve per segment pair.  countySegmentChange's
# "state" column holds the padded county FIPS, so it joins on fipsCounty.
countySegChangePlotData <- cty_new_20201203$clusterStateData %>%
    ungroup() %>%
    select(fipsCounty, date, countyName, state, pop, cpm7, dpm7) %>%
    filter(!is.na(cpm7), !is.na(dpm7)) %>%
    inner_join(select(countySegmentChange, fipsCounty=state, newSegment, oldSegment), by=c("fipsCounty")) %>%
    bind_rows(mutate(., fipsCounty="TOTAL", state="TOTAL", countyName="TOTAL")) %>%
    group_by(fipsCounty, date, countyName, state, newSegment, oldSegment) %>%
    summarize(cpm7=sum(cpm7*pop)/sum(pop), dpm7=sum(dpm7*pop)/sum(pop), pop=sum(pop), .groups="drop")

# Facet ordering: new segments ascending by row count, old segments descending
newSegLevels <- countySegChangePlotData %>% count(newSegment) %>% arrange(n) %>% pull(newSegment)
oldSegLevels <- countySegChangePlotData %>% count(oldSegment) %>% arrange(-n) %>% pull(oldSegment)

# One faceted plot per metric; only the weighted "TOTAL" curve is drawn
# (per-county grey lines are left commented out)
for (keyVar in c("cpm7", "dpm7")) {
    p1 <- countySegChangePlotData %>%
        mutate(newSegment=factor(newSegment, levels=newSegLevels), 
               oldSegment=factor(oldSegment, levels=oldSegLevels)
               ) %>%
        # .data[[keyVar]] is the supported tidy-eval idiom for a column name
        # held in a string; get(keyVar) relies on environment lookup inside
        # aes() and can silently pick up an unrelated object of the same name
        ggplot(aes(x=date, y=.data[[keyVar]])) + 
        # geom_line(data=~filter(., state != "TOTAL"), aes(group=fipsCounty), color="grey") +
        geom_line(data=~filter(., state == "TOTAL"), aes(group=1, color=newSegment)) + 
        facet_grid(oldSegment ~ newSegment) + 
        scale_x_date(date_breaks="1 months", date_labels="%b") + 
        scale_color_discrete("New\nSegment") +
        theme(axis.text.x = element_text(angle = 90)) +
        labs(x="", 
             y="Per million (7-day rolling mean)", 
             title=paste0(stringr::str_to_title(keyVar), " per million per day by segment overlap")
             )
    print(p1)
}

Further, updated CDC data are downloaded:

# Download new data
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201206.csv"
# Read and process the 2020-12-06 CDC all-cause deaths download; the echoed
# output below shows the raw and renamed structures plus control-total and
# suppression checks performed by the function
cdcList_20201206 <- readRunCDCAllCause(loc=cdcLoc, 
                                       startYear=2015,  # first baseline year
                                       curYear=2020,    # current comparison year
                                       weekThru=39,     # presumably last current-year week kept -- matches the suppression-check messaging below
                                       startWeek=9, 
                                       lst=test_hier5_201130, 
                                       epiMap=readFromRDS("epiMonth"), 
                                       agePopData=readFromRDS("usPopBucket2020"), 
                                       cvDeathThru="2020-09-26", 
                                       cdcPlotStartWeek=10, 
                                       dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)),  # only download if the CSV is not already on disk
                                       stateNoCheck=c("NC")  # NC has suppressed weeks; checks continue since it is listed here
                                       )
## Rows: 180,816
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        27094           7  3318967
## 2 45-64 years        33072          13 12975960
## 3 65-74 years        33058          14 12893879
## 4 75-84 years        33078          15 16026289
## 5 85 years and older 33066          16 20874940
## 6 Under 25 years     21448           0  1425191
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15366           0 5483774
##  4 2015-2019  2016 Unweighted           15366           0 5483774
##  5 2015-2019  2017 Predicted (weighted) 15317           0 5643340
##  6 2015-2019  2017 Unweighted           15317           0 5643340
##  7 2015-2019  2018 Predicted (weighted) 15307           0 5698023
##  8 2015-2019  2018 Unweighted           15307           0 5698023
##  9 2015-2019  2019 Predicted (weighted) 15317           0 5725524
## 10 2015-2019  2019 Unweighted           15317           0 5725524
## 11 2020       2020 Predicted (weighted) 13840          38 5840770
## 12 2020       2020 Unweighted           13792          27 5740348
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153184           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     65          65  0.    
## 3 2020      <NA>                                       27567           0  1.16e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153184           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  22570           8  9.90e6
## 3 2020     Data in recent weeks are incomplete. Only~    492           0  2.29e5
## 4 2020     Data in recent weeks are incomplete. Only~    259           0  3.00e4
## 5 2020     Data in recent weeks are incomplete. Only~   1925          57  4.47e5
## 6 2020     Data in recent weeks are incomplete. Only~     24           0  1.31e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    168           0  4.54e4
## 9 2020     <NA>                                         2146           0  8.93e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3684           0 33636341
## 2     CA           California 3684           0  3198495
## 3     FL              Florida 3684           0  2456535
## 4     TX                Texas 3684           0  2424271
## 5     PA         Pennsylvania 3684           0  1612787
## 6     OH                 Ohio 3684           0  1455156
## 7     IL             Illinois 3684           0  1272621
## 8     NY             New York 3684           0  1200005
## 9     MI             Michigan 3684           0  1157271
## 10    NC       North Carolina 3585          33  1072354
## 11    GA              Georgia 3683           0  1009519
## 12    NJ           New Jersey 3678           0   898785
## 13    TN            Tennessee 3684           0   880363
## 14    VA             Virginia 3684           0   808002
## 15    IN              Indiana 3681           0   782822
## 16    MO             Missouri 3682           0   762494
## 17    AZ              Arizona 3684           0   712935
## 18    MA        Massachusetts 3646           0   710364
## 19    YC        New York City 3680           0   693824
## 20    WA           Washington 3684           0   671307
## 21    AL              Alabama 3683           0   624906
## 22    WI            Wisconsin 3664           0   621043
## 23    MD             Maryland 3678           0   594189
## 24    SC       South Carolina 3680           0   586190
## 25    KY             Kentucky 3642           0   569155
## 26    LA            Louisiana 3680           0   549152
## 27    MN            Minnesota 3637           0   526472
## 28    CO             Colorado 3682           0   466081
## 29    OK             Oklahoma 3672           0   465298
## 30    OR               Oregon 3512           0   430304
## 31    MS          Mississippi 3620           0   379957
## 32    AR             Arkansas 3576           0   378866
## 33    CT          Connecticut 3236          19   370453
## 34    IA                 Iowa 3313           0   355740
## 35    PR          Puerto Rico 3392           0   344999
## 36    KS               Kansas 3370           0   310278
## 37    NV               Nevada 3415           0   301998
## 38    WV        West Virginia 3102          13   258872
## 39    UT                 Utah 3571           0   223643
## 40    NM           New Mexico 3251           0   213726
## 41    NE             Nebraska 2960           0   197462
## 42    ME                Maine 2750           0   166838
## 43    ID                Idaho 2877           0   160328
## 44    NH        New Hampshire 2772           0   140545
## 45    HI               Hawaii 2658           0   129455
## 46    RI         Rhode Island 2570           0   118811
## 47    MT              Montana 2656           0   115410
## 48    DE             Delaware 2662           0   103627
## 49    SD         South Dakota 2544           0    91220
## 50    ND         North Dakota 2526           0    78972
## 51    DC District of Columbia 2646           0    66641
## 52    VT              Vermont 2428           0    64407
## 53    WY              Wyoming 2411           0    49654
## 54    AK               Alaska 2453           0    44283
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,160
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 14 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 North Carol~ 2020-09-12 NC    2020     37 25-4~     NA 2020   Pred~ Suppres~
##  2 North Carol~ 2020-09-26 NC    2020     39 25-4~     NA 2020   Pred~ Suppres~
##  3 North Carol~ 2020-09-12 NC    2020     37 45-6~     NA 2020   Pred~ Suppres~
##  4 North Carol~ 2020-09-19 NC    2020     38 45-6~     NA 2020   Pred~ Suppres~
##  5 North Carol~ 2020-09-26 NC    2020     39 45-6~     NA 2020   Pred~ Suppres~
##  6 North Carol~ 2020-09-12 NC    2020     37 65-7~     NA 2020   Pred~ Suppres~
##  7 North Carol~ 2020-09-19 NC    2020     38 65-7~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-09-26 NC    2020     39 65-7~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-09-12 NC    2020     37 75-8~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-09-19 NC    2020     38 75-8~     NA 2020   Pred~ Suppres~
## 11 North Carol~ 2020-09-26 NC    2020     39 75-8~     NA 2020   Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC    2020     37 85 y~     NA 2020   Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC    2020     38 85 y~     NA 2020   Pred~ Suppres~
## 14 North Carol~ 2020-09-26 NC    2020     39 85 y~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 14 suppressions
##  *** Of these suppressions, 9 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,918
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)

## 
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the processed CDC list; with ovrWriteError=FALSE an existing file
# is left in place with a message instead of raising an error (see output)
saveToRDS(cdcList_20201206, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201206.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

A function is written to compare CDC deaths for the same time period in two different files:

cdcDeathCompare <- function(loc1, 
                            loc2, 
                            dir="./RInputFiles/Coronavirus/", 
                            periodKeep="2015-2019", 
                            weekThru=53, 
                            stateNoCheck=c(state.abb, "DC"), 
                            threshPct=0.0005, 
                            returnList=FALSE
                            ) {
    
    # Compare total CDC deaths by week between two raw CDC downloads:
    # plots both weekly series and the weekly deltas exceeding threshPct,
    # then returns the merged week-level data.
    #
    # FUNCTION ARGUMENTS:
    # loc1: the first raw CDC file (either character location of CSV or data frame)
    # loc2: the second raw CDC file (either character location of CSV or data frame)
    # dir: the directory containing the raw CDC files
    # periodKeep: keep all data from this time period
    # weekThru: keep all data that is from this week or earlier, even if not in periodKeep
    # stateNoCheck: do not run error checks for these states (all in this case)
    # threshPct: threshold for plotting as a difference
    # returnList: boolean, whether to return a list including both CDC file and the final difference file
    #             if FALSE (default), only the difference file is returned, and it is returned as tibble
    
    # Read a raw CDC file and aggregate to total deaths by week
    readRawCDC <- function(x) {
        readProcessCDC(x, weekThru=weekThru, periodKeep=periodKeep, fDir=dir, stateNoCheck=stateNoCheck) %>%
            group_by(weekEnding) %>%
            summarize(deaths=sum(deaths, na.rm=TRUE), .groups="drop")
    }
    
    # Resolve an input: read from disk when given a file name; otherwise
    # treat it as an already-aggregated data frame and use it as-is
    # (is.character() replaces the previous "character" %in% class() test)
    resolveCDC <- function(loc) {
        if (is.character(loc)) readRawCDC(loc) else loc
    }
    
    # Derive a display label for an input: the 8-digit date stamp embedded
    # in a file name (NA if absent -- callers use "_downloaded_YYYYMMDD"
    # names), or the latest weekEnding of a supplied data frame
    deriveName <- function(loc) {
        if (is.character(loc)) {
            stringr::str_extract(loc, pattern="\\d{8}")
        } else {
            loc %>% pull(weekEnding) %>% max() %>% as.character() %>% stringr::str_replace_all("-", "")
        }
    }
    
    cdc1 <- resolveCDC(loc1)
    cdc2 <- resolveCDC(loc2)
    
    # Merge the files on week; full join keeps weeks present in only one file
    cdc <- select(cdc1, weekEnding, deaths1=deaths) %>%
        full_join(select(cdc2, weekEnding, deaths2=deaths), by="weekEnding")
    
    # Mapping file: legend labels keyed by the merged column names
    name1 <- deriveName(loc1)
    name2 <- deriveName(loc2)
    mapFile <- c(deaths1=name1, deaths2=name2)
    
    # Plot both weekly death series
    p1 <- cdc %>%
        pivot_longer(-weekEnding) %>%
        mutate(name=mapFile[name]) %>%
        ggplot(aes(x=weekEnding, y=value)) +
        geom_line(aes(group=name, color=name)) + 
        labs(x="", 
             y="Total deaths", 
             title="Change in total deaths (predicted) reported in CDC data by time period"
             ) + 
        scale_color_discrete("CDC Time Period") + 
        ylim(0, NA)
    print(p1)
    
    # Extract and plot any weeks with change greater than threshPct,
    # measured relative to the larger of the two weekly totals
    p2 <- cdc %>%
        filter(!is.na(deaths1), !is.na(deaths2)) %>%
        mutate(delta=deaths2-deaths1) %>%
        filter(abs(delta) >= threshPct*pmax(deaths1, deaths2)) %>%
        ggplot(aes(x=weekEnding, y=delta)) + 
        geom_col(aes(fill=delta>0)) + 
        geom_text(aes(y=delta+100, label=delta), hjust=0) +
        labs(x="", 
             y="Delta between files (positive means more in second file)", 
             title="Change in CDC reported deaths by week", 
             subtitle=paste0("Includes only weeks with change at least ", round(threshPct*100, 3), "%\n", 
                             "Dates: ", name1," and ", name2
                             )
             ) + 
        coord_flip() + 
        scale_fill_discrete("Positive Delta")
    print(p2)
    
    # Return the merged CDC data (plus both inputs when returnList=TRUE)
    if (returnList) list(cdc=cdc, cdc1=cdc1, cdc2=cdc2)
    else cdc
    
}

# Compare the 2020-09-23 and 2020-12-06 CDC downloads week by week;
# returnList=TRUE keeps both per-file aggregates alongside the differences
loc1 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20200923.csv"
loc2 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201206.csv"

cdcCompare_0923_1206 <- cdcDeathCompare(loc1=loc1, loc2=loc2, returnList=TRUE)
## Rows: 174,311
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "1/10/2015", "1/17/2015", "1/24/2015", "1/31/2...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 174,311
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        26066           3  3170502
## 2 45-64 years        31884           7 12464677
## 3 65-74 years        31880          10 12347549
## 4 75-84 years        31902          11 15363924
## 5 85 years and older 31888           9 20067928
## 6 Under 25 years     20691           0  1372225
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15318           0 5643350
##  6 2015-2019  2017 Unweighted           15318           0 5643350
##  7 2015-2019  2018 Predicted (weighted) 15305           0 5698002
##  8 2015-2019  2018 Unweighted           15305           0 5698002
##  9 2015-2019  2019 Predicted (weighted) 15319           0 5725516
## 10 2015-2019  2019 Unweighted           15319           0 5725516
## 11 2020       2020 Predicted (weighted) 10586          24 4476340
## 12 2020       2020 Unweighted           10541          16 4376415
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153184           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     40          40  0.    
## 3 2020      <NA>                                       21087           0  8.85e6
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153184           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  16591           0  7.27e6
## 3 2020     Data in recent weeks are incomplete. Only~    324           0  1.52e5
## 4 2020     Data in recent weeks are incomplete. Only~    288          30  3.05e4
## 5 2020     Data in recent weeks are incomplete. Only~   1502          10  4.17e5
## 6 2020     Data in recent weeks are incomplete. Only~     60           0  2.71e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.23e4
## 8 2020     Weights may be too low to account for und~    436           0  1.40e5
## 9 2020     <NA>                                         1878           0  7.97e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3552           0 32276762
## 2     CA           California 3552           0  3072016
## 3     FL              Florida 3552           0  2357528
## 4     TX                Texas 3552           0  2314502
## 5     PA         Pennsylvania 3552           0  1548716
## 6     OH                 Ohio 3552           0  1395014
## 7     IL             Illinois 3552           0  1219119
## 8     NY             New York 3552           0  1155790
## 9     MI             Michigan 3552           0  1111203
## 10    NC       North Carolina 3521          17  1051632
## 11    GA              Georgia 3551           0   965504
## 12    NJ           New Jersey 3546           0   867210
## 13    TN            Tennessee 3552           0   840787
## 14    VA             Virginia 3552           0   774383
## 15    IN              Indiana 3550           0   749760
## 16    MO             Missouri 3548           0   728220
## 17    MA        Massachusetts 3516           0   685409
## 18    AZ              Arizona 3552           0   684537
## 19    YC        New York City 3548           0   671106
## 20    WA           Washington 3551           0   645406
## 21    AL              Alabama 3550           0   598526
## 22    WI            Wisconsin 3533           0   592047
## 23    MD             Maryland 3546           0   570238
## 24    SC       South Carolina 3549           0   560415
## 25    KY             Kentucky 3519           0   545032
## 26    LA            Louisiana 3545           0   525668
## 27    MN            Minnesota 3509           0   503567
## 28    CO             Colorado 3550           0   446708
## 29    OK             Oklahoma 3541           0   445362
## 30    OR               Oregon 3382           0   413553
## 31    MS          Mississippi 3488           0   363792
## 32    AR             Arkansas 3444           0   361612
## 33    CT          Connecticut 3106          13   356416
## 34    IA                 Iowa 3190           0   339791
## 35    PR          Puerto Rico 3272           0   331654
## 36    KS               Kansas 3246           0   296520
## 37    NV               Nevada 3291           0   289275
## 38    WV        West Virginia 3011          10   251046
## 39    UT                 Utah 3438           0   213931
## 40    NM           New Mexico 3140           0   205026
## 41    NE             Nebraska 2846           0   188492
## 42    ME                Maine 2646           0   160341
## 43    ID                Idaho 2766           0   152936
## 44    NH        New Hampshire 2672           0   135110
## 45    HI               Hawaii 2556           0   124379
## 46    RI         Rhode Island 2474           0   114274
## 47    MT              Montana 2556           0   109719
## 48    DE             Delaware 2558           0    99625
## 49    SD         South Dakota 2448           0    86461
## 50    ND         North Dakota 2433           0    75079
## 51    DC District of Columbia 2545           0    64202
## 52    VT              Vermont 2336           0    61925
## 53    WY              Wyoming 2318           0    47230
## 54    AK               Alaska 2352           0    42249
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 174,311
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,178
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 24 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 Connecticut  2020-08-15 CT    2020     33 45-6~     NA 2020   Pred~ Suppres~
##  2 Connecticut  2020-08-22 CT    2020     34 45-6~     NA 2020   Pred~ Suppres~
##  3 Connecticut  2020-08-15 CT    2020     33 65-7~     NA 2020   Pred~ Suppres~
##  4 Connecticut  2020-08-22 CT    2020     34 65-7~     NA 2020   Pred~ Suppres~
##  5 Connecticut  2020-08-15 CT    2020     33 75-8~     NA 2020   Pred~ Suppres~
##  6 Connecticut  2020-08-22 CT    2020     34 75-8~     NA 2020   Pred~ Suppres~
##  7 Connecticut  2020-08-15 CT    2020     33 85 y~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-08-15 NC    2020     33 25-4~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-08-15 NC    2020     33 45-6~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-08-22 NC    2020     34 45-6~     NA 2020   Pred~ Suppres~
## # ... with 14 more rows, and 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 24 suppressions
##  *** Of these suppressions, 24 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 81,990
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0Rows: 180,816
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        27094           7  3318967
## 2 45-64 years        33072          13 12975960
## 3 65-74 years        33058          14 12893879
## 4 75-84 years        33078          15 16026289
## 5 85 years and older 33066          16 20874940
## 6 Under 25 years     21448           0  1425191
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15366           0 5483774
##  4 2015-2019  2016 Unweighted           15366           0 5483774
##  5 2015-2019  2017 Predicted (weighted) 15317           0 5643340
##  6 2015-2019  2017 Unweighted           15317           0 5643340
##  7 2015-2019  2018 Predicted (weighted) 15307           0 5698023
##  8 2015-2019  2018 Unweighted           15307           0 5698023
##  9 2015-2019  2019 Predicted (weighted) 15317           0 5725524
## 10 2015-2019  2019 Unweighted           15317           0 5725524
## 11 2020       2020 Predicted (weighted) 13840          38 5840770
## 12 2020       2020 Unweighted           13792          27 5740348
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153184           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     65          65  0.    
## 3 2020      <NA>                                       27567           0  1.16e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153184           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  22570           8  9.90e6
## 3 2020     Data in recent weeks are incomplete. Only~    492           0  2.29e5
## 4 2020     Data in recent weeks are incomplete. Only~    259           0  3.00e4
## 5 2020     Data in recent weeks are incomplete. Only~   1925          57  4.47e5
## 6 2020     Data in recent weeks are incomplete. Only~     24           0  1.31e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    168           0  4.54e4
## 9 2020     <NA>                                         2146           0  8.93e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3684           0 33636341
## 2     CA           California 3684           0  3198495
## 3     FL              Florida 3684           0  2456535
## 4     TX                Texas 3684           0  2424271
## 5     PA         Pennsylvania 3684           0  1612787
## 6     OH                 Ohio 3684           0  1455156
## 7     IL             Illinois 3684           0  1272621
## 8     NY             New York 3684           0  1200005
## 9     MI             Michigan 3684           0  1157271
## 10    NC       North Carolina 3585          33  1072354
## 11    GA              Georgia 3683           0  1009519
## 12    NJ           New Jersey 3678           0   898785
## 13    TN            Tennessee 3684           0   880363
## 14    VA             Virginia 3684           0   808002
## 15    IN              Indiana 3681           0   782822
## 16    MO             Missouri 3682           0   762494
## 17    AZ              Arizona 3684           0   712935
## 18    MA        Massachusetts 3646           0   710364
## 19    YC        New York City 3680           0   693824
## 20    WA           Washington 3684           0   671307
## 21    AL              Alabama 3683           0   624906
## 22    WI            Wisconsin 3664           0   621043
## 23    MD             Maryland 3678           0   594189
## 24    SC       South Carolina 3680           0   586190
## 25    KY             Kentucky 3642           0   569155
## 26    LA            Louisiana 3680           0   549152
## 27    MN            Minnesota 3637           0   526472
## 28    CO             Colorado 3682           0   466081
## 29    OK             Oklahoma 3672           0   465298
## 30    OR               Oregon 3512           0   430304
## 31    MS          Mississippi 3620           0   379957
## 32    AR             Arkansas 3576           0   378866
## 33    CT          Connecticut 3236          19   370453
## 34    IA                 Iowa 3313           0   355740
## 35    PR          Puerto Rico 3392           0   344999
## 36    KS               Kansas 3370           0   310278
## 37    NV               Nevada 3415           0   301998
## 38    WV        West Virginia 3102          13   258872
## 39    UT                 Utah 3571           0   223643
## 40    NM           New Mexico 3251           0   213726
## 41    NE             Nebraska 2960           0   197462
## 42    ME                Maine 2750           0   166838
## 43    ID                Idaho 2877           0   160328
## 44    NH        New Hampshire 2772           0   140545
## 45    HI               Hawaii 2658           0   129455
## 46    RI         Rhode Island 2570           0   118811
## 47    MT              Montana 2656           0   115410
## 48    DE             Delaware 2662           0   103627
## 49    SD         South Dakota 2544           0    91220
## 50    ND         North Dakota 2526           0    78972
## 51    DC District of Columbia 2646           0    66641
## 52    VT              Vermont 2428           0    64407
## 53    WY              Wyoming 2411           0    49654
## 54    AK               Alaska 2453           0    44283
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 90,432
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 38 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 Connecticut  2020-11-07 CT    2020     45 25-4~     NA 2020   Pred~ Suppres~
##  2 Connecticut  2020-11-14 CT    2020     46 25-4~     NA 2020   Pred~ Suppres~
##  3 Connecticut  2020-11-07 CT    2020     45 45-6~     NA 2020   Pred~ Suppres~
##  4 Connecticut  2020-11-14 CT    2020     46 45-6~     NA 2020   Pred~ Suppres~
##  5 Connecticut  2020-11-21 CT    2020     47 45-6~     NA 2020   Pred~ Suppres~
##  6 Connecticut  2020-11-07 CT    2020     45 65-7~     NA 2020   Pred~ Suppres~
##  7 Connecticut  2020-11-14 CT    2020     46 65-7~     NA 2020   Pred~ Suppres~
##  8 Connecticut  2020-11-07 CT    2020     45 75-8~     NA 2020   Pred~ Suppres~
##  9 Connecticut  2020-11-14 CT    2020     46 75-8~     NA 2020   Pred~ Suppres~
## 10 Connecticut  2020-11-21 CT    2020     47 75-8~     NA 2020   Pred~ Suppres~
## # ... with 28 more rows, and 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 38 suppressions
##  *** Of these suppressions, 38 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)

## Rows: 85,052
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## Warning: Removed 11 row(s) containing missing values (geom_path).

# Inspect the structure of the comparison object returned by cdcDeathCompare()
# (returnList=TRUE form): $cdc holds the week-by-week death totals from both
# files side by side (deaths1 = earlier file, deaths2 = later file), while
# $cdc1 and $cdc2 hold each file's totals individually.
str(cdcCompare_0923_1206)
## List of 3
##  $ cdc : tibble [307 x 3] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths1   : num [1:307] 61684 61061 58613 57206 57205 ...
##   ..$ deaths2   : num [1:307] 61685 61061 58604 57214 57205 ...
##  $ cdc1: tibble [296 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:296], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths    : num [1:296] 61684 61061 58613 57206 57205 ...
##  $ cdc2: tibble [307 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths    : num [1:307] 61685 61061 58604 57214 57205 ...

There is eventual modest restatement of CDC data in most weeks, and more significant restatement of CDC data in the four most recent weeks of the September 23, 2020 data file (the last available week ends September 5; the last mostly non-restated data week ends August 8).

The function is updated to return either a list (includes each CDC data file) or just the differences file.

The function is run again for two different time periods:

# Local snapshots of the CDC weekly-deaths-by-jurisdiction-and-age file,
# downloaded on 2020-10-14 and 2020-11-20 respectively.
loc3 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201014.csv"
loc4 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201120.csv"

# Compare the two snapshots to measure how much the CDC restated earlier weeks
# between the two download dates. returnList=TRUE returns the full list
# (both cleaned data files plus the differences) rather than just the
# differences tibble. cdcDeathCompare() is defined in the sourced
# Coronavirus_Statistics_Functions_CDC file.
cdcCompare_1014_1120 <- cdcDeathCompare(loc1=loc3, loc2=loc4, returnList=TRUE)
## Rows: 176,119
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 176,119
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        26364           0  3214460
## 2 45-64 years        32211           4 12612924
## 3 65-74 years        32200           5 12500875
## 4 75-84 years        32221           6 15546198
## 5 85 years and older 32210           7 20287067
## 6 Under 25 years     20913           0  1388384
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15319           0 5643363
##  6 2015-2019  2017 Unweighted           15319           0 5643363
##  7 2015-2019  2018 Predicted (weighted) 15305           0 5698004
##  8 2015-2019  2018 Unweighted           15305           0 5698004
##  9 2015-2019  2019 Predicted (weighted) 15317           0 5725524
## 10 2015-2019  2019 Unweighted           15317           0 5725524
## 11 2020       2020 Predicted (weighted) 11490          13 4858272
## 12 2020       2020 Unweighted           11447           9 4757540
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153182           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     22          22  0.    
## 3 2020      <NA>                                       22915           0  9.62e6
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 10 x 5
## # Groups:   period [2]
##    period   Note                                           n n_deaths_na  deaths
##    <chr>    <chr>                                      <int>       <int>   <dbl>
##  1 2015-20~ <NA>                                      153182           0  5.59e7
##  2 2020     Data in recent weeks are incomplete. Onl~  18444           0  8.13e6
##  3 2020     Data in recent weeks are incomplete. Onl~    360           0  1.68e5
##  4 2020     Data in recent weeks are incomplete. Onl~    382          19  4.45e4
##  5 2020     Data in recent weeks are incomplete. Onl~   1329           3  2.88e5
##  6 2020     Data in recent weeks are incomplete. Onl~     60           0  2.79e4
##  7 2020     Estimates for Pennsylvania are too low f~     36           0  1.68e4
##  8 2020     Weights may be too low to account for un~    284           0  7.19e4
##  9 2020     Weights may be too low to account for un~     12           0  5.72e3
## 10 2020     <NA>                                        2030           0  8.65e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3588           0 32655734
## 2     CA           California 3588           0  3108728
## 3     FL              Florida 3588           0  2387134
## 4     TX                Texas 3588           0  2348924
## 5     PA         Pennsylvania 3588           0  1566142
## 6     OH                 Ohio 3588           0  1411315
## 7     IL             Illinois 3588           0  1232847
## 8     NY             New York 3588           0  1167718
## 9     MI             Michigan 3588           0  1123550
## 10    NC       North Carolina 3536           8  1059363
## 11    GA              Georgia 3588           0   978193
## 12    NJ           New Jersey 3580           0   875776
## 13    TN            Tennessee 3588           0   851531
## 14    VA             Virginia 3588           0   782913
## 15    IN              Indiana 3587           0   758488
## 16    MO             Missouri 3586           0   737546
## 17    AZ              Arizona 3588           0   692460
## 18    MA        Massachusetts 3554           0   692109
## 19    YC        New York City 3584           0   676973
## 20    WA           Washington 3588           0   652928
## 21    AL              Alabama 3585           0   606105
## 22    WI            Wisconsin 3570           0   598878
## 23    MD             Maryland 3582           0   576822
## 24    SC       South Carolina 3586           0   568254
## 25    KY             Kentucky 3553           0   552088
## 26    LA            Louisiana 3577           0   532758
## 27    MN            Minnesota 3547           0   509530
## 28    CO             Colorado 3586           0   451948
## 29    OK             Oklahoma 3576           0   450316
## 30    OR               Oregon 3418           0   418422
## 31    MS          Mississippi 3522           0   368372
## 32    AR             Arkansas 3484           0   366066
## 33    CT          Connecticut 3146           9   360662
## 34    IA                 Iowa 3224           0   343678
## 35    PR          Puerto Rico 3305           0   336026
## 36    KS               Kansas 3282           0   300158
## 37    NV               Nevada 3327           0   292844
## 38    WV        West Virginia 3033           3   253132
## 39    UT                 Utah 3474           0   216546
## 40    NM           New Mexico 3170           0   207657
## 41    NE             Nebraska 2878           0   190649
## 42    ME                Maine 2676           0   162212
## 43    ID                Idaho 2796           0   154882
## 44    NH        New Hampshire 2700           0   136571
## 45    HI               Hawaii 2588           0   125932
## 46    RI         Rhode Island 2504           0   115701
## 47    MT              Montana 2584           0   111052
## 48    DE             Delaware 2590           0   100672
## 49    SD         South Dakota 2478           0    87630
## 50    ND         North Dakota 2459           0    75823
## 51    DC District of Columbia 2571           0    64792
## 52    VT              Vermont 2362           0    62663
## 53    WY              Wyoming 2348           0    47960
## 54    AK               Alaska 2379           2    42735
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 176,119
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,081
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 13 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 Alaska       2020-09-26 AK    2020     39 85 y~     NA 2020   Pred~ Suppres~
##  2 Connecticut  2020-09-12 CT    2020     37 45-6~     NA 2020   Pred~ Suppres~
##  3 Connecticut  2020-09-12 CT    2020     37 65-7~     NA 2020   Pred~ Suppres~
##  4 Connecticut  2020-09-12 CT    2020     37 75-8~     NA 2020   Pred~ Suppres~
##  5 Connecticut  2020-09-19 CT    2020     38 75-8~     NA 2020   Pred~ Suppres~
##  6 Connecticut  2020-09-12 CT    2020     37 85 y~     NA 2020   Pred~ Suppres~
##  7 North Carol~ 2020-08-29 NC    2020     35 45-6~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-08-29 NC    2020     35 65-7~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-08-29 NC    2020     35 75-8~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-08-29 NC    2020     35 85 y~     NA 2020   Pred~ Suppres~
## 11 West Virgin~ 2020-09-26 WV    2020     39 65-7~     NA 2020   Pred~ Suppres~
## 12 West Virgin~ 2020-09-26 WV    2020     39 75-8~     NA 2020   Pred~ Suppres~
## 13 West Virgin~ 2020-09-26 WV    2020     39 85 y~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 13 suppressions
##  *** Of these suppressions, 13 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,841
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0Rows: 179,663
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        26907           4  3293947
## 2 45-64 years        32860          15 12885563
## 3 65-74 years        32851          16 12792037
## 4 75-84 years        32870          19 15899092
## 5 85 years and older 32856          19 20718264
## 6 Under 25 years     21319           2  1416502
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416391
##  2 2015-2019  2015 Unweighted           15285           0 5416391
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15318           0 5643347
##  6 2015-2019  2017 Unweighted           15318           0 5643347
##  7 2015-2019  2018 Predicted (weighted) 15307           0 5698022
##  8 2015-2019  2018 Unweighted           15307           0 5698022
##  9 2015-2019  2019 Predicted (weighted) 15318           0 5725502
## 10 2015-2019  2019 Unweighted           15318           0 5725502
## 11 2020       2020 Predicted (weighted) 13260          41 5584252
## 12 2020       2020 Unweighted           13217          34 5487101
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153186           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     75          75  0.    
## 3 2020      <NA>                                       26402           0  1.11e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153186           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  21043          34  9.13e6
## 3 2020     Data in recent weeks are incomplete. Only~    444           0  2.04e5
## 4 2020     Data in recent weeks are incomplete. Only~    339          22  4.45e4
## 5 2020     Data in recent weeks are incomplete. Only~   2241          19  7.10e5
## 6 2020     Data in recent weeks are incomplete. Only~     48           0  2.62e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    312           0  1.16e5
## 9 2020     <NA>                                         2002           0  8.22e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3660           0 33382812
## 2     CA           California 3660           0  3175134
## 3     FL              Florida 3660           0  2439960
## 4     TX                Texas 3660           0  2405990
## 5     PA         Pennsylvania 3660           0  1600457
## 6     OH                 Ohio 3660           0  1443190
## 7     IL             Illinois 3660           0  1260836
## 8     NY             New York 3660           0  1191716
## 9     MI             Michigan 3660           0  1148050
## 10    NC       North Carolina 3573          33  1068292
## 11    GA              Georgia 3659           0  1002322
## 12    NJ           New Jersey 3654           0   892684
## 13    TN            Tennessee 3660           0   872960
## 14    VA             Virginia 3660           0   802114
## 15    IN              Indiana 3658           0   776857
## 16    MO             Missouri 3656           0   755398
## 17    AZ              Arizona 3660           0   707602
## 18    MA        Massachusetts 3624           0   705559
## 19    YC        New York City 3656           0   689474
## 20    WA           Washington 3658          10   664276
## 21    AL              Alabama 3658           0   620268
## 22    WI            Wisconsin 3640           0   614607
## 23    MD             Maryland 3654           0   589675
## 24    SC       South Carolina 3658           0   582138
## 25    KY             Kentucky 3621           0   564915
## 26    LA            Louisiana 3655           0   545500
## 27    MN            Minnesota 3612           0   521445
## 28    CO             Colorado 3657           0   462126
## 29    OK             Oklahoma 3649           0   461666
## 30    OR               Oregon 3490           0   427650
## 31    MS          Mississippi 3595           0   376968
## 32    AR             Arkansas 3552           0   375477
## 33    CT          Connecticut 3215          17   367731
## 34    IA                 Iowa 3293           0   352522
## 35    PR          Puerto Rico 3372           0   343055
## 36    KS               Kansas 3348           0   307450
## 37    NV               Nevada 3394           0   299546
## 38    WV        West Virginia 3098          11   258798
## 39    UT                 Utah 3546           0   221678
## 40    NM           New Mexico 3231           0   212208
## 41    NE             Nebraska 2942           0   195453
## 42    ME                Maine 2732           0   165642
## 43    ID                Idaho 2856           0   158780
## 44    NH        New Hampshire 2751           0   139437
## 45    HI               Hawaii 2642           0   128673
## 46    RI         Rhode Island 2553           0   117919
## 47    MT              Montana 2638           0   114177
## 48    DE             Delaware 2645           0   102858
## 49    SD         South Dakota 2522           4    89560
## 50    ND         North Dakota 2512           0    78477
## 51    DC District of Columbia 2630           0    66270
## 52    VT              Vermont 2416           0    64000
## 53    WY              Wyoming 2395           0    49201
## 54    AK               Alaska 2433           0    43852
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 89,853
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 41 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 Connecticut  2020-10-24 CT    2020     43 45-6~     NA 2020   Pred~ Suppres~
##  2 Connecticut  2020-10-31 CT    2020     44 45-6~     NA 2020   Pred~ Suppres~
##  3 Connecticut  2020-10-24 CT    2020     43 65-7~     NA 2020   Pred~ Suppres~
##  4 Connecticut  2020-10-31 CT    2020     44 65-7~     NA 2020   Pred~ Suppres~
##  5 Connecticut  2020-10-24 CT    2020     43 75-8~     NA 2020   Pred~ Suppres~
##  6 Connecticut  2020-10-31 CT    2020     44 75-8~     NA 2020   Pred~ Suppres~
##  7 Connecticut  2020-11-07 CT    2020     45 75-8~     NA 2020   Pred~ Suppres~
##  8 Connecticut  2020-10-24 CT    2020     43 85 y~     NA 2020   Pred~ Suppres~
##  9 Connecticut  2020-10-31 CT    2020     44 85 y~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-09-05 NC    2020     36 25-4~     NA 2020   Pred~ Suppres~
## # ... with 31 more rows, and 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 41 suppressions
##  *** Of these suppressions, 41 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)

## Rows: 84,507
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Inspect the structure of the comparison object (list of joined frame + the two inputs)
str(cdcCompare_1014_1120)
## List of 3
##  $ cdc : tibble [305 x 3] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:305], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths1   : num [1:305] 61684 61069 58605 57206 57205 ...
##   ..$ deaths2   : num [1:305] 61684 61062 58612 57206 57205 ...
##  $ cdc1: tibble [299 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:299], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths    : num [1:299] 61684 61069 58605 57206 57205 ...
##  $ cdc2: tibble [305 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ weekEnding: Date[1:305], format: "2015-01-10" "2015-01-17" ...
##   ..$ deaths    : num [1:305] 61684 61062 58612 57206 57205 ...

A similar pattern is seen, with larger restatements in the more recent months of the older data file.

The function has been updated to accept either a data frame or a character path to a CSV file. For example, it can be called directly on previously loaded data frames:

# Compare the 2020-10-14 data (cdc1) against the 2020-12-06 data (cdc2);
# returnList=FALSE returns only the joined comparison tibble rather than a list
cdcCompare_1014_1206 <- cdcDeathCompare(loc1=cdcCompare_1014_1120$cdc1, 
                                        loc2=cdcCompare_0923_1206$cdc2, 
                                        returnList=FALSE
                                        )
## Warning: Removed 8 row(s) containing missing values (geom_path).

# With returnList=FALSE the result is a single tibble (weekEnding, deaths1, deaths2)
str(cdcCompare_1014_1206)
## tibble [307 x 3] (S3: tbl_df/tbl/data.frame)
##  $ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
##  $ deaths1   : num [1:307] 61684 61069 58605 57206 57205 ...
##  $ deaths2   : num [1:307] 61685 61061 58604 57214 57205 ...

Data are integrated from across four files:

# Combine the four download snapshots into one long frame:
# one row per (weekEnding, source-file date) with the deaths reported at that time.
# Column names are download dates (yyyymmdd) so they can be parsed back with ymd().
cdcFourFiles <- cdcCompare_0923_1206$cdc %>%
    rename('20200923'=deaths1, '20201206'=deaths2) %>%
    full_join(rename(cdcCompare_1014_1120$cdc, '20201014'=deaths1, '20201120'=deaths2)) %>%
    pivot_longer(-weekEnding, names_to="source", values_to="reportedDeaths") %>%
    mutate(source=lubridate::ymd(source)) %>%
    arrange(weekEnding, source)
## Joining, by = "weekEnding"
cdcFourFiles
## # A tibble: 1,228 x 3
##    weekEnding source     reportedDeaths
##    <date>     <date>              <dbl>
##  1 2015-01-10 2020-09-23          61684
##  2 2015-01-10 2020-10-14          61684
##  3 2015-01-10 2020-11-20          61684
##  4 2015-01-10 2020-12-06          61685
##  5 2015-01-17 2020-09-23          61061
##  6 2015-01-17 2020-10-14          61069
##  7 2015-01-17 2020-11-20          61062
##  8 2015-01-17 2020-12-06          61061
##  9 2015-01-24 2020-09-23          58613
## 10 2015-01-24 2020-10-14          58605
## # ... with 1,218 more rows
# Function to plot restatement levels for a file
plotCDCRestatement <- function(df, 
                               keyVar="reportedDeaths", 
                               keyYears=c(2020), 
                               firstWeek=NULL
                               ) {
    
    # FUNCTION ARGUMENTS
    # df: a frame containing weekEnding-source-reportedDeaths
    # keyVar: character string naming the numeric column to compare
    #         (defaults to "reportedDeaths")
    # keyYears: years (of weekEnding) to be included in the plots
    # firstWeek: character description of the baseline file date
    #            (NULL means calculate from data as the earliest source)
    #
    # Prints two plots and invisibly returns them as a named list.
    
    # Keep only weeks with a non-NA value in every source file so that
    # ratios are computed over a consistent set of snapshots
    dfNoNA <- df %>%
        rename("numKey"=all_of(keyVar)) %>%
        group_by(weekEnding) %>%
        summarize(numNA=sum(is.na(numKey)), .groups="drop") %>%
        filter(numNA==0) %>%
        select(weekEnding) %>%
        inner_join(df, by="weekEnding") %>%
        # all_of() here too (was bare keyVar), which silences the tidyselect
        # "external vector in selections is ambiguous" message
        rename("numKey"=all_of(keyVar))
    
    # Default the baseline label to the earliest source date in the data
    if (is.null(firstWeek)) firstWeek <- dfNoNA %>% pull(source) %>% min() %>% as.character()
    
    # For each reporting week, ratio of each snapshot's value to the value
    # in the earliest (baseline) snapshot
    dfRatio <- dfNoNA %>%
        arrange(weekEnding, source) %>%
        group_by(weekEnding) %>%
        mutate(firstNum=first(numKey), pctFirst=numKey/firstNum) %>%
        ungroup()
    
    # Plot 1: restatement ratio over time, one line per source file
    p1 <- dfRatio %>%
        filter(lubridate::year(weekEnding) %in% keyYears) %>%
        ggplot(aes(x=weekEnding, y=pctFirst)) + 
        geom_line(aes(group=source, color=factor(source))) + 
        labs(x="Data reported for week", 
             y=paste0("Ratio vs. reported in ", firstWeek, " file"), 
             title="Ratio of CDC all-cause deaths reported by week", 
             subtitle=paste0("Compared to number by week reported in ", firstWeek, " file")
             ) + 
        scale_color_discrete("Download Date")
    print(p1)
    
    # Plot 2: for the most recent snapshot of each week (last row after
    # arranging by source), show the ratio by number of weeks prior to
    # the latest week in the filtered data
    p2 <- dfRatio %>%
        filter(lubridate::year(weekEnding) %in% keyYears) %>%
        mutate(weeksPrior=as.integer(max(weekEnding)-weekEnding)/7) %>%
        group_by(weekEnding) %>%
        filter(row_number()==n()) %>%
        ungroup() %>%
        ggplot(aes(x=weeksPrior, y=pctFirst)) + 
        geom_text(aes(label=paste0(round(100*pctFirst), "%")), size=3) + 
        labs(x=paste0("Weeks prior to latest week reported in ", firstWeek, " file"), 
             y=paste0("Ratio vs. reported in ", firstWeek, " file"), 
             title="Ratio of CDC all-cause deaths reported by week", 
             subtitle=paste0("Compared to number by week reported in ", firstWeek, " file")
             )
    print(p2)
    
    # Return the plots for further use without printing again
    invisible(list(ratioPlot=p1, lagPlot=p2))
}

# Plot restatement levels across all four snapshot files
plotCDCRestatement(cdcFourFiles)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(keyVar)` instead of `keyVar` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

It appears that projected deaths in the 2020-09-23 file were mostly finalized prior to the most recent four weeks. Inflation factors of ~125%, ~110%, ~105%, and ~105% would have been needed for the current week and the three preceding weeks to match the ‘final’ totals for those weeks as reported in the 2020-12-06 file.

The plotting routine has been converted to functional form and re-run. A comparison of percentages is then made using the data from 2020-10-14 as the baseline:

# Re-run with only the 2020-10-14 and later snapshots, so 2020-10-14
# becomes the baseline file for the ratios
cdcFourFiles %>%
    filter(source >= as.Date("2020-10-14")) %>%
    plotCDCRestatement()

And the process is repeated, but excluding the 2020-12-06 file:

# Re-run excluding the newest (2020-12-06) snapshot to compare lag
# behavior across the earlier files only
cdcFourFiles %>%
    filter(source < as.Date("2020-12-06")) %>%
    plotCDCRestatement()

There is some directional evidence that the extra lag may be less in the 2020-10-14 file than in the 2020-09-23 file. Further exploration of this topic could be interesting.

The latest CDC all-cause death data are downloaded:

# Download new data
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201213.csv"
# Read and process the latest CDC all-cause death file.
# - startYear/curYear/weekThru/startWeek bound the analysis window
# - lst: test_hier5_201130 — presumably the previously fit segment object
#   to measure against the new data (TODO confirm against the sourcing files)
# - epiMap/agePopData: cached inputs loaded from RDS
# - dlData: download only if the local copy does not already exist
# - stateNoCheck: states allowed to fail the suppression checks (NC here)
cdcList_20201213 <- readRunCDCAllCause(loc=cdcLoc, 
                                       startYear=2015, 
                                       curYear=2020,
                                       weekThru=40, 
                                       startWeek=9, 
                                       lst=test_hier5_201130, 
                                       epiMap=readFromRDS("epiMonth"), 
                                       agePopData=readFromRDS("usPopBucket2020"), 
                                       cvDeathThru="2020-10-03", 
                                       cdcPlotStartWeek=10, 
                                       dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)), 
                                       stateNoCheck=c("NC")
                                       )
## Rows: 181,400
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 181,400
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        27179           9  3331642
## 2 45-64 years        33177          18 13021484
## 3 65-74 years        33168          18 12947006
## 4 75-84 years        33188          20 16091745
## 5 85 years and older 33181          23 20957230
## 6 Under 25 years     21507           0  1429214
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15285           0 5416393
##  2 2015-2019  2015 Unweighted           15285           0 5416393
##  3 2015-2019  2016 Predicted (weighted) 15365           0 5483764
##  4 2015-2019  2016 Unweighted           15365           0 5483764
##  5 2015-2019  2017 Predicted (weighted) 15316           0 5643329
##  6 2015-2019  2017 Unweighted           15316           0 5643329
##  7 2015-2019  2018 Predicted (weighted) 15306           0 5698014
##  8 2015-2019  2018 Unweighted           15306           0 5698014
##  9 2015-2019  2019 Predicted (weighted) 15318           0 5725533
## 10 2015-2019  2019 Unweighted           15318           0 5725533
## 11 2020       2020 Predicted (weighted) 14137          49 5976187
## 12 2020       2020 Unweighted           14083          39 5868068
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153180           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     88          88  0.    
## 3 2020      <NA>                                       28132           0  1.18e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153180           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  23072           0  1.01e7
## 3 2020     Data in recent weeks are incomplete. Only~    468           0  2.18e5
## 4 2020     Data in recent weeks are incomplete. Only~    220          26  2.44e4
## 5 2020     Data in recent weeks are incomplete. Only~   2038          62  4.95e5
## 6 2020     Data in recent weeks are incomplete. Only~     60           0  3.06e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    134           0  4.01e4
## 9 2020     <NA>                                         2180           0  8.98e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3696           0 33768456
## 2     CA           California 3696           0  3207848
## 3     FL              Florida 3696           0  2465594
## 4     TX                Texas 3696           0  2434435
## 5     PA         Pennsylvania 3696           0  1619409
## 6     OH                 Ohio 3696           0  1461352
## 7     IL             Illinois 3696           0  1278684
## 8     NY             New York 3696           0  1204510
## 9     MI             Michigan 3696           0  1162511
## 10    NC       North Carolina 3600          48  1072568
## 11    GA              Georgia 3695           0  1012939
## 12    NJ           New Jersey 3689           0   902620
## 13    TN            Tennessee 3696           0   883980
## 14    VA             Virginia 3696           0   810957
## 15    IN              Indiana 3694           0   787707
## 16    MO             Missouri 3692           0   766234
## 17    AZ              Arizona 3696           0   715040
## 18    MA        Massachusetts 3658           0   712881
## 19    YC        New York City 3691           0   695737
## 20    WA           Washington 3695           0   673730
## 21    AL              Alabama 3695           0   627446
## 22    WI            Wisconsin 3676           0   624424
## 23    MD             Maryland 3690           0   596634
## 24    SC       South Carolina 3692           0   587910
## 25    KY             Kentucky 3654           0   571559
## 26    LA            Louisiana 3691           0   551128
## 27    MN            Minnesota 3647           0   529039
## 28    CO             Colorado 3694           0   468347
## 29    OK             Oklahoma 3682           0   466810
## 30    OR               Oregon 3523           0   432033
## 31    MS          Mississippi 3630           0   381529
## 32    AR             Arkansas 3589           0   380474
## 33    CT          Connecticut 3244          26   370676
## 34    IA                 Iowa 3324           0   358019
## 35    PR          Puerto Rico 3401           0   346043
## 36    KS               Kansas 3382           0   311724
## 37    NV               Nevada 3429           0   303343
## 38    WV        West Virginia 3119          14   260054
## 39    UT                 Utah 3581           0   224667
## 40    NM           New Mexico 3260           0   214877
## 41    NE             Nebraska 2968           0   198537
## 42    ME                Maine 2760           0   167532
## 43    ID                Idaho 2885           0   160947
## 44    NH        New Hampshire 2780           0   141077
## 45    HI               Hawaii 2668           0   129851
## 46    RI         Rhode Island 2577           0   119134
## 47    MT              Montana 2666           0   115998
## 48    DE             Delaware 2670           0   103965
## 49    SD         South Dakota 2551           0    91836
## 50    ND         North Dakota 2537           0    79575
## 51    DC District of Columbia 2652           0    66835
## 52    VT              Vermont 2434           0    64666
## 53    WY              Wyoming 2420           0    49956
## 54    AK               Alaska 2463           0    44484
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 181,400
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,455
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 19 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 North Carol~ 2020-09-12 NC    2020     37 25-4~     NA 2020   Pred~ Suppres~
##  2 North Carol~ 2020-09-19 NC    2020     38 25-4~     NA 2020   Pred~ Suppres~
##  3 North Carol~ 2020-09-26 NC    2020     39 25-4~     NA 2020   Pred~ Suppres~
##  4 North Carol~ 2020-09-12 NC    2020     37 45-6~     NA 2020   Pred~ Suppres~
##  5 North Carol~ 2020-09-19 NC    2020     38 45-6~     NA 2020   Pred~ Suppres~
##  6 North Carol~ 2020-09-26 NC    2020     39 45-6~     NA 2020   Pred~ Suppres~
##  7 North Carol~ 2020-10-03 NC    2020     40 45-6~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-09-12 NC    2020     37 65-7~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-09-19 NC    2020     38 65-7~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-09-26 NC    2020     39 65-7~     NA 2020   Pred~ Suppres~
## 11 North Carol~ 2020-10-03 NC    2020     40 65-7~     NA 2020   Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC    2020     37 75-8~     NA 2020   Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC    2020     38 75-8~     NA 2020   Pred~ Suppres~
## 14 North Carol~ 2020-09-26 NC    2020     39 75-8~     NA 2020   Pred~ Suppres~
## 15 North Carol~ 2020-10-03 NC    2020     40 75-8~     NA 2020   Pred~ Suppres~
## 16 North Carol~ 2020-09-12 NC    2020     37 85 y~     NA 2020   Pred~ Suppres~
## 17 North Carol~ 2020-09-19 NC    2020     38 85 y~     NA 2020   Pred~ Suppres~
## 18 North Carol~ 2020-09-26 NC    2020     39 85 y~     NA 2020   Pred~ Suppres~
## 19 North Carol~ 2020-10-03 NC    2020     40 85 y~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 19 suppressions
##  *** Of these suppressions, 15 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 83,195
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)

## 
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the CDC results for reuse. With ovrWriteError=FALSE an existing
# RDS file is left in place with a message (see output below) rather than
# raising an error, so re-knitting this report never clobbers saved data.
saveToRDS(cdcList_20201213, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201213.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

New state-level data are downloaded, with existing segments used:

# Refresh state-level COVID Tracking Project data, re-using the previously
# derived clusters so existing segments can be scored against the new data.
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201214.csv"

# Download only when no cached copy exists: a NULL target skips the download
# and the cached CSV at locDownload is read instead.
dlTarget <- if (file.exists(locDownload)) NULL else locDownload

old_hier5_201214 <- readRunCOVIDTrackingProject(
  thruLabel   = "Dec 13, 2020",
  downloadTo  = dlTarget,
  readFrom    = locDownload,
  compareFile = readFromRDS("test_hier5_201025")$dfRaw,
  useClusters = readFromRDS("test_hier5_201130")$useClusters
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         16339303        292404              12655613
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-12-14 2020-12-13 2020-12-12 2020-12-11 2020-12-10 2020-12-09 2020-12-08 2020-12-07 2020-12-06 2020-12-05 2020-12-04 2020-12-03 2020-12-02 2020-12-01 2020-11-30 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##           date                  name newValue oldValue
## 1   2020-03-05      positiveIncrease       86      103
## 2   2020-03-06      positiveIncrease      128      109
## 3   2020-03-07      positiveIncrease      129      176
## 4   2020-03-10      positiveIncrease      441      387
## 5   2020-03-11      positiveIncrease      497      509
## 6   2020-03-12      positiveIncrease      745      671
## 7   2020-03-13      positiveIncrease      933     1072
## 8   2020-03-14      positiveIncrease      970      924
## 9   2020-03-15      positiveIncrease     1217     1291
## 10  2020-03-16      positiveIncrease     1847     1739
## 11  2020-03-17      positiveIncrease     2249     2588
## 12  2020-03-18      positiveIncrease     3364     3089
## 13  2020-03-21 hospitalizedCurrently     1492     1436
## 14  2020-03-23 hospitalizedCurrently     2812     2770
## 15  2020-03-24      positiveIncrease    11116    10769
## 16  2020-03-25      positiveIncrease    12590    12908
## 17  2020-03-25 hospitalizedCurrently     5140     5062
## 18  2020-03-28      positiveIncrease    19602    19925
## 19  2020-03-28         deathIncrease      551      544
## 20  2020-03-29         deathIncrease      504      515
## 21  2020-03-30      positiveIncrease    21467    22042
## 22  2020-03-31      positiveIncrease    25187    24853
## 23  2020-03-31         deathIncrease      907      890
## 24  2020-04-01      positiveIncrease    26115    25791
## 25  2020-04-06      positiveIncrease    28425    29002
## 26  2020-04-09      positiveIncrease    35090    34503
## 27  2020-04-10      positiveIncrease    33489    34380
## 28  2020-04-10         deathIncrease     2072     2108
## 29  2020-04-11      positiveIncrease    31105    30501
## 30  2020-04-11         deathIncrease     2079     2054
## 31  2020-04-13      positiveIncrease    24398    25195
## 32  2020-04-14      positiveIncrease    26078    25719
## 33  2020-04-15      positiveIncrease    29859    30307
## 34  2020-04-16      positiveIncrease    31577    30978
## 35  2020-04-23         deathIncrease     1814     1791
## 36  2020-04-24         deathIncrease     1972     1895
## 37  2020-04-25         deathIncrease     1627     1748
## 38  2020-04-27         deathIncrease     1287     1270
## 39  2020-04-29         deathIncrease     2685     2713
## 40  2020-05-01         deathIncrease     1808     1779
## 41  2020-05-02         deathIncrease     1531     1562
## 42  2020-05-05         deathIncrease     2494     2452
## 43  2020-05-06         deathIncrease     1916     1948
## 44  2020-05-07      positiveIncrease    27227    27537
## 45  2020-05-12      positiveIncrease    22558    22890
## 46  2020-05-12         deathIncrease     1506     1486
## 47  2020-05-13      positiveIncrease    21628    21285
## 48  2020-05-13         deathIncrease     1734     1704
## 49  2020-05-14         deathIncrease     1852     1879
## 50  2020-05-15      positiveIncrease    25422    24685
## 51  2020-05-15         deathIncrease     1535     1507
## 52  2020-05-16      positiveIncrease    23586    24702
## 53  2020-05-16         deathIncrease     1237      987
## 54  2020-05-17         deathIncrease      873      849
## 55  2020-05-21         deathIncrease     1377     1394
## 56  2020-05-22      positiveIncrease    24173    24433
## 57  2020-05-22         deathIncrease     1291     1341
## 58  2020-05-23      positiveIncrease    22365    21531
## 59  2020-05-23         deathIncrease     1038     1063
## 60  2020-05-24      positiveIncrease    18860    20072
## 61  2020-05-24         deathIncrease      689      680
## 62  2020-05-26         deathIncrease      665      645
## 63  2020-05-27         deathIncrease     1335     1321
## 64  2020-05-29         deathIncrease     1171     1184
## 65  2020-05-30      positiveIncrease    23437    23682
## 66  2020-06-01         deathIncrease      680      668
## 67  2020-06-02         deathIncrease      973      962
## 68  2020-06-03      positiveIncrease    20155    20390
## 69  2020-06-03         deathIncrease      974      993
## 70  2020-06-04      positiveIncrease    20383    20886
## 71  2020-06-04         deathIncrease      881      893
## 72  2020-06-05      positiveIncrease    23066    23394
## 73  2020-06-05         deathIncrease      837      826
## 74  2020-06-06      positiveIncrease    22558    23064
## 75  2020-06-06         deathIncrease      714      728
## 76  2020-06-08         deathIncrease      674      661
## 77  2020-06-09         deathIncrease      891      902
## 78  2020-06-12      positiveIncrease    23096    23597
## 79  2020-06-12         deathIncrease      766      775
## 80  2020-06-15         deathIncrease      387      381
## 81  2020-06-16         deathIncrease      718      730
## 82  2020-06-17         deathIncrease      779      767
## 83  2020-06-18      positiveIncrease    27089    27746
## 84  2020-06-18         deathIncrease      685      705
## 85  2020-06-19      positiveIncrease    30959    31471
## 86  2020-06-20      positiveIncrease    31951    32294
## 87  2020-06-20         deathIncrease      615      629
## 88  2020-06-21      positiveIncrease    28848    27928
## 89  2020-06-23      positiveIncrease    33885    33447
## 90  2020-06-23         deathIncrease      722      710
## 91  2020-06-24         deathIncrease      707      724
## 92  2020-06-26         deathIncrease      621      637
## 93  2020-06-27         deathIncrease      503      511
## 94  2020-06-29         deathIncrease      338      332
## 95  2020-06-30         deathIncrease      580      596
## 96  2020-07-02      positiveIncrease    53508    54085
## 97  2020-07-04         deathIncrease      300      306
## 98  2020-07-06      positiveIncrease    41494    41959
## 99  2020-07-06         deathIncrease      235      243
## 100 2020-07-07         deathIncrease      910      923
## 101 2020-07-08         deathIncrease      818      807
## 102 2020-07-10         deathIncrease      835      854
## 103 2020-07-14      positiveIncrease    59250    62687
## 104 2020-07-14         deathIncrease      745      736
## 105 2020-07-15      positiveIncrease    69101    65797
## 106 2020-07-20         deathIncrease      375      363
## 107 2020-07-22         deathIncrease     1142     1171
## 108 2020-07-23         deathIncrease     1074     1056
## 109 2020-07-25         deathIncrease     1009     1023
## 110 2020-07-26      positiveIncrease    60123    61000
## 111 2020-07-30         deathIncrease     1245     1259
## 112 2020-07-31         deathIncrease     1329     1312
## 113 2020-08-01      positiveIncrease    60245    61101
## 114 2020-08-02      positiveIncrease    52737    46812
## 115 2020-08-03      positiveIncrease    43122    49713
## 116 2020-08-06         deathIncrease     1237     1251
## 117 2020-08-08      positiveIncrease    53083    53712
## 118 2020-08-14      positiveIncrease    57093    55636
## 119 2020-08-16      positiveIncrease    41782    42487
## 120 2020-08-20         deathIncrease     1122     1134
## 121 2020-08-22      positiveIncrease    45722    46236
## 122 2020-08-24      positiveIncrease    34249    34643
## 123 2020-08-29      positiveIncrease    43962    44501
## 124 2020-08-31         deathIncrease      377      366
## 125 2020-09-02      positiveIncrease    30216    30603
## 126 2020-09-07      positiveIncrease    28142    28682
## 127 2020-09-08         deathIncrease      350      358
## 128 2020-09-10         deathIncrease     1156     1170
## 129 2020-09-12         deathIncrease      821      810
## 130 2020-09-15      positiveIncrease    34945    35445
## 131 2020-09-16         deathIncrease     1184     1200
## 132 2020-09-17         deathIncrease      878      863
## 133 2020-09-19      positiveIncrease    44906    45564
## 134 2020-09-19         deathIncrease      751      740
## 135 2020-09-20      positiveIncrease    35504    36295
## 136 2020-09-24         deathIncrease      933      921
## 137 2020-09-27      positiveIncrease    34983    35454
## 138 2020-09-28      positiveIncrease    35362    36524
## 139 2020-09-28         deathIncrease      245      257
## 140 2020-10-02         deathIncrease      844      835
## 141 2020-10-04      positiveIncrease    37982    38439
## 142 2020-10-04         deathIncrease      373      363
## 143 2020-10-06         deathIncrease      622      634
## 144 2020-10-09         deathIncrease      905      893
## 145 2020-10-10         deathIncrease      690      665
## 146 2020-10-11      positiveIncrease    46268    46946
## 147 2020-10-12      positiveIncrease    42645    43124
## 148 2020-10-13         deathIncrease      724      690
## 149 2020-10-14         deathIncrease      798      811
## 150 2020-10-15         deathIncrease      928      951
## 151 2020-10-16         deathIncrease      894      877
## 152 2020-10-17      positiveIncrease    57355    57943
## 153 2020-10-18      positiveIncrease    48280    48922
## 154 2020-10-18         deathIncrease      402      393
## 155 2020-10-19         deathIncrease      451      456
## 156 2020-10-21      positiveIncrease    60980    58606
## 157 2020-10-22      positiveIncrease    73003    75248
## 158 2020-10-22         deathIncrease     1126     1143
## 159 2020-10-23         deathIncrease      941      916
## 160 2020-10-24         deathIncrease      897      885
## Joining, by = c("date", "name")
## Warning: Removed 51 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##    state                  name newValue oldValue
## 1     AK      positiveIncrease    12523    13535
## 2     CO      positiveIncrease    93398    91570
## 3     CO         deathIncrease     2218     2076
## 4     FL      positiveIncrease   766305   776249
## 5     ND         deathIncrease      453      345
## 6     NM      positiveIncrease    41040    40168
## 7     NM hospitalizedCurrently    27399    27120
## 8     PR      positiveIncrease    31067    61275
## 9     RI      positiveIncrease    30581    30116
## 10    WA hospitalizedCurrently    92643    69716
## Rows: 16,082
## Columns: 55
## $ date                        <date> 2020-12-14, 2020-12-14, 2020-12-14, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 40160, 297895, 187507, 0, 420248, 15850...
## $ probableCases               <dbl> NA, 53133, 26701, NA, 15954, NA, 11826,...
## $ negative                    <dbl> 1107400, 1478907, 1705843, 2140, 212692...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults            <dbl> 1147560, 1723669, 1866199, 2140, 434469...
## $ hospitalizedCurrently       <dbl> 140, 2286, 1050, NA, 3677, 14578, 1585,...
## $ hospitalizedCumulative      <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ inIcuCurrently              <dbl> NA, NA, 372, NA, 829, 3078, NA, NA, 63,...
## $ inIcuCumulative             <dbl> NA, 2363, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 12, NA, 180, NA, 542, NA, NA, NA, 34, N...
## $ onVentilatorCumulative      <dbl> NA, 1353, 1095, NA, NA, NA, NA, NA, NA,...
## $ recovered                   <dbl> 7165, 174805, 163351, NA, 62118, NA, 15...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt                <chr> "12/14/2020 03:59", "12/14/2020 11:00",...
## $ dateModified                <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ checkTimeEt                 <chr> "12/13 22:59", "12/14 06:00", "12/13 19...
## $ death                       <dbl> 176, 4102, 2990, 0, 7358, 21046, 3969, ...
## $ hospitalized                <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ dateChecked                 <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ totalTestsViral             <dbl> 1147560, NA, 1866199, 2140, 4344693, 27...
## $ positiveTestsViral          <dbl> 47796, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 1098479, NA, 1705843, NA, NA, NA, NA, N...
## $ positiveCasesViral          <dbl> NA, 244762, 160356, 0, 404294, 1585044,...
## $ deathConfirmed              <dbl> 176, 3624, 2656, NA, 6782, NA, 3398, 43...
## $ deathProbable               <dbl> NA, 478, 334, NA, 576, NA, 571, 1047, N...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 3809889, NA, 78...
## $ totalTestsPeopleViral       <dbl> NA, 1723669, NA, NA, 2531222, NA, 19688...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 377958, NA, 235882, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 22560, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 212548, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 78262, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 179158, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 32616, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 422, 2264, 1805, 0, 11806, 33278, 2911,...
## $ negativeIncrease            <dbl> 4040, 20347, 9262, 0, 7223, 323174, 103...
## $ total                       <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ totalTestResultsIncrease    <dbl> 4462, 27230, 10495, 0, 38805, 356452, 3...
## $ posNeg                      <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ deathIncrease               <dbl> 0, 0, 45, 0, 1, 77, 11, 81, 1, 0, 138, ...
## $ hospitalizedIncrease        <dbl> 3, 767, 64, 0, 193, 0, 48, 0, 0, 0, 142...
## $ hash                        <chr> "58baf833f72d7115b62a7e4dd1ab3545263286...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState    cases deaths  hosp     tests     n
##   <lgl>         <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE         72033   1426    NA    510886  1370
## 2 TRUE       16267270 290978    NA 220582479 14712
## Rows: 14,712
## Columns: 6
## $ date   <date> 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 422, 2264, 1805, 11806, 33278, 2911, 7231, 164, 997, 8343, 3...
## $ deaths <dbl> 0, 0, 45, 1, 77, 11, 81, 1, 0, 138, 28, 0, 60, 6, 116, 35, 3...
## $ hosp   <dbl> 140, 2286, 1050, 3677, 14578, 1585, 1243, 239, 373, 4932, 33...
## $ tests  <dbl> 4462, 27230, 10495, 38805, 356452, 36588, 119244, 4714, 9931...
## Rows: 14,712
## Columns: 14
## $ date   <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state  <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-15 through current
## 
## Recency is defined as 2020-11-15 through current

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 row(s) containing missing values (geom_path).
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the updated state-level results. ovrWriteError=FALSE keeps any
# existing RDS file (a message is printed instead of an error), making the
# knit idempotent.
saveToRDS(old_hier5_201214, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/old_hier5_201214.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

New county-level data are downloaded, with existing segments used:

# Locations for the cached population, case, and death files
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201215.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv"

# Run old county segments against the newly downloaded data.
# dlCaseDeath: re-download only when either cached file is missing. The
# condition is scalar (one path each), so use short-circuit && rather than
# the elementwise & — clearer intent, and R >= 4.3 would flag any
# accidental length-&gt;1 input.
cty_old_20201215 <- readRunUSAFacts(maxDate="2020-12-13", 
                                    popLoc=popLoc, 
                                    caseLoc=caseLoc, 
                                    deathLoc=deathLoc, 
                                    dlCaseDeath=!(file.exists(caseLoc) && file.exists(deathLoc)),
                                    oldFile=readFromRDS("cty_20201026")$dfBurden, 
                                    existingCountyClusters=readFromRDS("cty_new_20201203")$clustVec
                                    )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,044,765
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: Missing column names filled in: 'X332' [332]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   X332 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
##  row  col           expected actual                                                                      file
## 3196 X332 1/0/T/F/TRUE/FALSE        './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv'
## Warning: Problem with `mutate()` input `date`.
## i  3196 failed to parse.
## i Input `date` is `lubridate::mdy(date)`.
## Warning: 3196 failed to parse.
## Rows: 1,048,288
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0      0.00251                0               0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-14 through current
## 
## Recency is defined as 2020-11-14 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the county-level results; with ovrWriteError=FALSE an existing
# file is kept (with a message) rather than raising an error
saveToRDS(cty_old_20201215, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201215.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

Comparisons are made between the state-level totals as reflected in COVID Tracking Project and USA Facts:

# State-level totals from USA Facts through 2020-12-10
# (na.rm=TRUE added to the deaths sums for consistency with the cases sum;
# previously a single NA death value would have made the state total NA)
usaFacts <- cty_old_20201215$clusterStateData %>% 
    filter(date <= as.Date("2020-12-10")) %>% 
    group_by(state) %>%
    summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths, na.rm=TRUE), .groups="drop")

# State-level totals from COVID Tracking Project through the same cut date
ctp <- old_hier5_201214$plotData %>% 
    filter(date <= as.Date("2020-12-10")) %>% 
    group_by(state) %>% 
    summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths, na.rm=TRUE), .groups="drop")

# Stack the two sources (bind_rows .id tags rows 1/2 by input), relabel the
# tags with readable source names, then pivot to long metric/value form
usaCTP <- usaFacts %>%
    bind_rows(ctp, .id="source") %>%
    mutate(source=c('1'="USA Facts", '2'="COVID Tracking Project")[source]) %>%
    pivot_longer(-c(source, state), names_to="metric", values_to="value")

# Plot CTP totals as a percentage of USA Facts totals by metric and state,
# ordered by each state's worst deviation from parity.
# (id_cols is now named explicitly; passing it positionally to pivot_wider
# is easy to misread as a data argument)
usaCTP %>%
    pivot_wider(id_cols=c(state, metric), names_from="source", values_from="value") %>%
    mutate(pct=`COVID Tracking Project`/`USA Facts`) %>%
    ggplot(aes(x=fct_reorder(state, abs(pct-1), .fun=max), y=pct)) + 
    geom_point() + 
    coord_flip() +
    labs(x="", 
         y="COVID Tracking Project as % of USA Facts", 
         title="Comparison of COVID Tracking Project and USA Facts", 
         subtitle="Data as of 10-DEC-2020"
         ) +
    geom_hline(yintercept=1, lty=2) +
    facet_wrap(~metric)

# Same comparison, restricted to states that differ by at least 2.5% on at
# least one metric, with the percentage printed instead of a point.
# (id_cols named explicitly, as above)
usaCTP %>%
    pivot_wider(id_cols=c(state, metric), names_from="source", values_from="value") %>%
    mutate(pct=`COVID Tracking Project`/`USA Facts`) %>%
    group_by(state) %>%
    filter(max(abs(pct-1))>=0.025) %>%
    ggplot(aes(x=fct_reorder(state, abs(pct-1), .fun=max), y=pct)) + 
    geom_text(aes(label=paste0(round(100*pct, 1), "%")), size=3) + 
    coord_flip() +
    labs(x="", 
         y="COVID Tracking Project as % of USA Facts", 
         title="Comparison of COVID Tracking Project and USA Facts", 
         subtitle="Data as of 10-DEC-2020 (filtered to only states at least 2.5% different on one metric)"
         ) +
    geom_hline(yintercept=1, lty=2) +
    facet_wrap(~metric)

It may be particularly interesting to explore some of the larger differences:

  • Cases - VA, IA, RI, TX
  • Deaths - VA, NY, IA, GA, KS

Virginia is particularly of note for being ~25% different on both cases and deaths. Iowa is particularly of note for being ~15% different on both cases and deaths, but in different directions.

A function is written to explore the daily data for a given state:

exploreState <- function(keyState, 
                         ctp, 
                         usa, 
                         maxDate=NULL, 
                         minDate=NULL
                         ) {
    
    # Compare daily and cumulative case/death counts for one or more states
    # across COVID Tracking Project and USA Facts; prints a daily plot and a
    # cumulative plot, then returns the pivoted data.
    #
    # FUNCTION ARGUMENTS:
    # keyState: the state abbreviation(s) of interest
    # ctp: a processed list from COVID Tracking Project (uses $plotData)
    # usa: a processed list from USA Facts (uses $clusterStateData)
    # maxDate: the latest date to use (NULL means use all)
    # minDate: the minimum date to use (NULL means use all)
    #
    # RETURNS: a tibble (grouped by source/metric) with one row per
    # date/source/metric holding the daily value and running cumulative value
    
    # Extract the relevant data from COVID Tracking Project
    ctp <- ctp[["plotData"]] %>% 
        filter(state %in% keyState) %>% 
        group_by(date) %>% 
        summarize(cases=sum(cases), deaths=sum(deaths), .groups="drop")
    
    # Extract the relevant data from USA Facts
    usa <- usa[["clusterStateData"]] %>% 
        filter(state %in% keyState) %>% 
        group_by(date) %>%
        summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths), .groups="drop")
    
    # Integrate the data, tagging each row with its source
    ctpUSA <- mutate(ctp, source="COVID Tracking Project") %>%
        bind_rows(mutate(usa, source="USA Facts"))
    
    # Filter for minDate and maxDate if requested
    # (BUG FIX: the maxDate filter previously used `>=`, which kept dates ON
    # OR AFTER maxDate rather than on or before it)
    if (!is.null(minDate)) ctpUSA <- filter(ctpUSA, date >= as.Date(minDate))
    if (!is.null(maxDate)) ctpUSA <- filter(ctpUSA, date <= as.Date(maxDate))

    # Create a pivoted data frame with a running cumulative value per
    # source/metric (returned still grouped, matching prior behavior)
    pivotDF <- ctpUSA %>%
        pivot_longer(-c(date, source), names_to="metric", values_to="value") %>%
        arrange(source, metric, date) %>%
        group_by(source, metric) %>%
        mutate(cumValue=cumsum(value))
    
    # Plot the data by date
    p1 <- pivotDF %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=source, color=source)) + 
        facet_wrap(~metric, scales="free_y") + 
        labs(x="", y="Impact", title=paste0("Impact by date for: ", keyState)) + 
        scale_color_discrete("Source")
    print(p1)
    
    # Plot cumulative data by date
    p2 <- pivotDF %>%
        ggplot(aes(x=date, y=cumValue)) + 
        geom_line(aes(group=source, color=source)) + 
        facet_wrap(~metric, scales="free_y") + 
        labs(x="", y="Cumulative Impact", title=paste0("Cumulative impact by date for: ", keyState)) + 
        scale_color_discrete("Source")
    print(p2)
    
    # Return the processed file
    pivotDF
    
}


# Test the function for NY (full date range: minDate/maxDate left NULL)
exploreState("NY", ctp=old_hier5_201214, usa=cty_old_20201215)

## # A tibble: 1,230 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-02 COVID Tracking Project cases      0        0
##  2 2020-03-03 COVID Tracking Project cases      1        1
##  3 2020-03-04 COVID Tracking Project cases      0        1
##  4 2020-03-05 COVID Tracking Project cases      2        3
##  5 2020-03-06 COVID Tracking Project cases     22       25
##  6 2020-03-07 COVID Tracking Project cases     11       36
##  7 2020-03-08 COVID Tracking Project cases     24       60
##  8 2020-03-09 COVID Tracking Project cases     28       88
##  9 2020-03-10 COVID Tracking Project cases     63      151
## 10 2020-03-11 COVID Tracking Project cases     44      195
## # ... with 1,220 more rows

The data sources agree almost perfectly on cases, but there are some significant differences on deaths by day leading to a total of ~25k in COVID Tracking Project and ~35k in USA Facts. Reported data are closer to ~35k so there may be some further digging needed for the COVID Tracking Project process.

The function is also run for VA and IA as they are the two other largest outliers:

# Explore Virginia, one of the two other largest outliers
exploreState("VA", ctp=old_hier5_201214, usa=cty_old_20201215)
## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 row(s) containing missing values (geom_path).

## # A tibble: 1,240 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-02-27 COVID Tracking Project cases      0        0
##  2 2020-02-28 COVID Tracking Project cases      0        0
##  3 2020-02-29 COVID Tracking Project cases      0        0
##  4 2020-03-01 COVID Tracking Project cases      0        0
##  5 2020-03-02 COVID Tracking Project cases      0        0
##  6 2020-03-03 COVID Tracking Project cases      0        0
##  7 2020-03-04 COVID Tracking Project cases      0        0
##  8 2020-03-05 COVID Tracking Project cases      0        0
##  9 2020-03-06 COVID Tracking Project cases      0        0
## 10 2020-03-07 COVID Tracking Project cases      0        0
## # ... with 1,230 more rows
# Explore Iowa, the other large outlier
exploreState("IA", ctp=old_hier5_201214, usa=cty_old_20201215)

## # A tibble: 1,222 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-06 COVID Tracking Project cases      0        0
##  2 2020-03-07 COVID Tracking Project cases      0        0
##  3 2020-03-08 COVID Tracking Project cases      0        0
##  4 2020-03-09 COVID Tracking Project cases      3        3
##  5 2020-03-10 COVID Tracking Project cases      5        8
##  6 2020-03-11 COVID Tracking Project cases      5       13
##  7 2020-03-12 COVID Tracking Project cases      1       14
##  8 2020-03-13 COVID Tracking Project cases      2       16
##  9 2020-03-14 COVID Tracking Project cases      1       17
## 10 2020-03-15 COVID Tracking Project cases      1       18
## # ... with 1,212 more rows

With the Virginia data, USA Facts consistently reports lower than COVID Tracking Project, reaching ~3500 deaths with ~225k cases while COVID Tracking Project reports ~4500 deaths and ~275k cases. The COVID Tracking Project data are much closer to official numbers than the USA Facts data.

With the Iowa data, the disconnect in cases begins roughly in September, with USA Facts reporting ~250k cases total and COVID Tracking Project reporting ~225k cases total (both report ~3200 deaths). Official data for Iowa are closer to ~260k cases and ~3300 deaths.

Further exploration of the raw data files may be merited to see if any data are being deleted (e.g., county name/FIPS mismatch or NYC tracked separately from NYS or the like).

The raw data files from COVID Tracking Project are explored:

# Raw data from COVID Tracking Project, summed across all dates by state
# (summarize_if is superseded in dplyr >= 1.0; across(where(...)) is the
# current idiom and produces the same ungrouped result)
ctpRaw <- old_hier5_201214$dfRaw %>% 
    group_by(state) %>% 
    summarize(across(where(is.numeric), ~ sum(.x, na.rm=TRUE)), .groups="drop")

# States in ctpRaw that are not in c(state.abb, "DC")
# Includes only American Samoa, Guam, Mariana Islands, Puerto Rico, Virgin Islands
ctpRaw %>%
    filter(!(state %in% c(state.abb, "DC")))
## # A tibble: 5 x 45
##   state positive probableCases negative pending totalTestResults
##   <chr>    <dbl>         <dbl>    <dbl>   <dbl>            <dbl>
## 1 AS           0             0   266911     315           266911
## 2 GU      493309          6476  8100860       0          8594169
## 3 MP       13477             0  2689283     139          2702760
## 4 PR     4114221         50956 53405932   47728         57520153
## 5 VI      182437             0  2974272    9900          3156709
## # ... with 39 more variables: hospitalizedCurrently <dbl>,
## #   hospitalizedCumulative <dbl>, inIcuCurrently <dbl>, inIcuCumulative <dbl>,
## #   onVentilatorCurrently <dbl>, onVentilatorCumulative <dbl>, recovered <dbl>,
## #   death <dbl>, hospitalized <dbl>, totalTestsViral <dbl>,
## #   positiveTestsViral <dbl>, negativeTestsViral <dbl>,
## #   positiveCasesViral <dbl>, deathConfirmed <dbl>, deathProbable <dbl>,
## #   totalTestEncountersViral <dbl>, totalTestsPeopleViral <dbl>,
## #   totalTestsAntibody <dbl>, positiveTestsAntibody <dbl>,
## #   negativeTestsAntibody <dbl>, totalTestsPeopleAntibody <dbl>,
## #   positiveTestsPeopleAntibody <dbl>, negativeTestsPeopleAntibody <dbl>,
## #   totalTestsPeopleAntigen <dbl>, positiveTestsPeopleAntigen <dbl>,
## #   totalTestsAntigen <dbl>, positiveTestsAntigen <dbl>,
## #   positiveIncrease <dbl>, negativeIncrease <dbl>, total <dbl>,
## #   totalTestResultsIncrease <dbl>, posNeg <dbl>, deathIncrease <dbl>,
## #   hospitalizedIncrease <dbl>, commercialScore <dbl>,
## #   negativeRegularScore <dbl>, negativeScore <dbl>, positiveScore <dbl>,
## #   score <dbl>
# All summed metrics for Iowa, New York, and Virginia, transposed so each
# state is a column for easier side-by-side comparison
ctpRaw %>%
    filter(state %in% c("NY", "IA", "VA")) %>%
    column_to_rownames("state") %>%
    t()
##                                    IA         NY        VA
## positive                     17342936  110047057  27433393
## probableCases                       0          0   1895495
## negative                    114208570 1890862895 330080375
## pending                           105        520     93853
## totalTestResults            131551506 2000909952 358459244
## hospitalizedCurrently          110932    1005029    305204
## hospitalizedCumulative           1696   22097024   2102099
## inIcuCurrently                  28213     274027     72436
## inIcuCumulative                     0          0         0
## onVentilatorCurrently           14404      70476     36829
## onVentilatorCumulative              0          0         0
## recovered                    12612159   17800199   3198609
## death                          272184    6038024    588412
## hospitalized                     1696   22097024   2102099
## totalTestsViral              16298088          0         0
## positiveTestsViral            1895748          0  30965382
## negativeTestsViral           14220042          0         0
## positiveCasesViral           17342611  110047057  25433700
## deathConfirmed                      0          0    547794
## deathProbable                       0          0     37776
## totalTestEncountersViral            0 2000909952 358459244
## totalTestsPeopleViral       130352195          0         0
## totalTestsAntibody                  0          0         0
## positiveTestsAntibody               0          0         0
## negativeTestsAntibody               0          0         0
## totalTestsPeopleAntibody     10315397          0         0
## positiveTestsPeopleAntibody    671773          0         0
## negativeTestsPeopleAntibody   9636219          0         0
## totalTestsPeopleAntigen       7795638          0         0
## positiveTestsPeopleAntigen    1219168          0         0
## totalTestsAntigen             5213686          0         0
## positiveTestsAntigen           282757          0         0
## positiveIncrease               222785     784204    285149
## negativeIncrease               901974   21337935   3482447
## total                       131551611 2000910472 357607621
## totalTestResultsIncrease      1124759   22122139   3722568
## posNeg                      131551506 2000909952 357513768
## deathIncrease                    3273      27870      4414
## hospitalizedIncrease                0      89995     16073
## commercialScore                     0          0         0
## negativeRegularScore                0          0         0
## negativeScore                       0          0         0
## positiveScore                       0          0         0
## score                               0          0         0

There is no state-level surrogate missing (e.g., ‘YC’ is not being used to distinguish NYC from NYS). There are several variables that potentially relate to deaths:

  • death
  • deathConfirmed
  • deathProbable
  • deathIncrease

The data are filtered to just these columns, and the results displayed for easier viewing:

# Death-related metrics only, for Iowa, New York, and Virginia
ctpRaw %>%
    filter(state %in% c("NY", "IA", "VA")) %>%
    select(state, death, deathConfirmed, deathProbable, deathIncrease) %>%
    column_to_rownames("state") %>%
    t()
##                    IA      NY     VA
## death          272184 6038024 588412
## deathConfirmed      0       0 547794
## deathProbable       0       0  37776
## deathIncrease    3273   27870   4414

The variable ‘death’ is potentially of interest, as it appears to be an attempt at cumsum(deathIncrease). Is this observed in the data?

# Daily data from COVID Tracking Project: compare the reported cumulative
# deaths ('death') to a running sum of the daily increases, by state.
# (coalesce() replaces the ifelse(is.na(...), 0, ...) NA-to-zero pattern)
ctpDaily <- old_hier5_201214$dfRaw %>% 
    arrange(state, date) %>%
    select(state, date, death, deathIncrease) %>%
    group_by(state) %>% 
    mutate(cumDeathIncrease=cumsum(coalesce(deathIncrease, 0))) %>%
    ungroup()

# Rows where the two disagree; an empty tibble means they align
# (NOTE(review): rows where death is NA are silently dropped by `!=`)
ctpDaily %>% filter(death != cumDeathIncrease)
## # A tibble: 0 x 5
## # ... with 5 variables: state <chr>, date <date>, death <dbl>,
## #   deathIncrease <dbl>, cumDeathIncrease <dbl>

The data fields are aligned. According to COVID Tracking Project, their NYS data include a significant undercount of the LTC deaths, driven by reporting issues with NYS.

Further, COVID Tracking Project explain that “As of June 1, 2020, New York City reported 5740 more deaths (probable and confirmed) than New York State reports for NYC. Due to an exclusion of probable COVID-19 deaths in the state’s data, New York City and New York State report substantially diverging death counts. We use the data from the state, so our dataset excludes these probable COVID-19 deaths.”

While the disconnect is large, NYC is a very large component of NYS and LTC is a very large component of COVID death. This would likely explain very different numbers of deaths reported in NYS, even as there is close agreement on the number of cases.

The total difference in deaths and cases by state as of early December is explored:

# Absolute difference in reported deaths by state, with bar fill showing
# which source is higher (id_cols named explicitly in pivot_wider)
usaCTP %>%
    filter(metric=="deaths") %>%
    pivot_wider(id_cols=state, names_from="source", values_from="value") %>%
    mutate(ctpMinusUSA=`COVID Tracking Project`-`USA Facts`, 
           absDiff=abs(ctpMinusUSA), 
           signDiff=ctpMinusUSA>=0
           ) %>%
    ggplot(aes(x=fct_reorder(state, absDiff), y=absDiff)) + 
    geom_text(aes(y=absDiff+100, label=absDiff), hjust=0, size=3) +
    geom_col(aes(fill=signDiff)) + 
    scale_fill_discrete("CTP >= USAF") +
    coord_flip() + 
    labs(x="", 
         y="Absolute Value of Difference in Reported Deaths", 
         title="Difference in Reported Deaths (COVID Tracking Project vs USA Facts)"
         )

# Absolute difference in reported cases by state, labeled in thousands
# (id_cols named explicitly in pivot_wider)
usaCTP %>%
    filter(metric=="cases") %>%
    pivot_wider(id_cols=state, names_from="source", values_from="value") %>%
    mutate(ctpMinusUSA=`COVID Tracking Project`-`USA Facts`, 
           absDiff=abs(ctpMinusUSA), 
           signDiff=ctpMinusUSA>=0
           ) %>%
    ggplot(aes(x=fct_reorder(state, absDiff), y=absDiff)) + 
    geom_text(aes(y=absDiff+100, label=paste0(round(absDiff/1000, 1), "k")), hjust=0, size=3) +
    geom_col(aes(fill=signDiff)) + 
    scale_fill_discrete("CTP >= USAF") +
    coord_flip() + 
    labs(x="", 
         y="Absolute Value of Difference in Reported Cases", 
         title="Difference in Reported Cases (COVID Tracking Project vs USA Facts)"
         )

This highlights a few additional states of interest, including:

  • Deaths - IL
  • Cases - CA, GA, MI, TN, FL

It is notable that the data sources generally agree on the totals in most states, with large differences in either direction observed in some exception states.

The USA Facts data are explored for Virginia:

# Processed USA Facts burden data for Virginia: one row per county with its
# population and the max (i.e., latest cumulative) case/death counts
countyVA <- cty_old_20201215$burdenData %>% 
    filter(state=="VA") %>% 
    group_by(county, countyFIPS) %>% 
    summarize(pop=max(population, na.rm=TRUE), 
              cases=max(cumCases, na.rm=TRUE), 
              deaths=max(cumDeaths, na.rm=TRUE), 
              .groups="drop"
              ) 
# Statewide totals across counties (identifier columns dropped first)
countyVA %>% 
    select(-county, -countyFIPS) %>% 
    colSums()
##     pop   cases  deaths 
## 6802317  221912    3615

Virginia should have a population closer to 8.5 million, so the processed USA Facts file appears to be missing about 20% of the state. The raw files are explored in more detail:

# Locations for the population, case, and death file
# (same snapshot paths as used for readRunUSAFacts earlier in this file)
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201215.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv"

# Read the raw county population file directly
rawCountyPop <- readr::read_csv(popLoc) 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
# Row count and total population for Virginia in the raw file
rawCountyPop %>%
    filter(State=="VA") %>%
    summarize(n=n(), population=sum(population))
## # A tibble: 1 x 2
##       n population
##   <int>      <dbl>
## 1   134    8535519

The raw population file has 8.5 million people across 134 counties, in contrast to the 6.8 million people across 120 counties in the final processed file. What causes the disconnect?

# VA rows present in the raw population file but absent from the processed
# burden data (anti_join keeps population rows with no countyFIPS match)
mismatchVA <- rawCountyPop %>%
    filter(State=="VA") %>%
    anti_join(select(countyVA, countyFIPS), by="countyFIPS")
mismatchVA
## # A tibble: 14 x 4
##    countyFIPS `County Name`         State population
##         <dbl> <chr>                 <chr>      <dbl>
##  1          0 Statewide Unallocated VA             0
##  2      51115 Matthews County       VA          8834
##  3      51510 Alexandria City       VA        159428
##  4      51540 Charlottesville City  VA         47266
##  5      51550 Chesapeake City       VA        244835
##  6      51590 Danville City         VA         40044
##  7      51630 Fredericksburg City   VA         29036
##  8      51660 Harrisonburg City     VA         53016
##  9      51683 Manassas City         VA         41085
## 10      51710 Norfolk City          VA        242742
## 11      51740 Portsmouth City       VA         94398
## 12      51760 Richmond City         VA        230436
## 13      51800 Suffolk City          VA         92108
## 14      51810 Virginia Beach City   VA        449974
# Row count and total population dropped due to the mismatch
mismatchVA %>%
    summarize(n=n(), population=sum(population))
## # A tibble: 1 x 2
##       n population
##   <int>      <dbl>
## 1    14    1733202

It appears that the processed data is missing the large cities in Virginia that report separately from the counties. Are these available in the deaths data?

# Read the raw wide-format deaths file (one column per date; per the warning
# below, X332 is a filled-in name for a column with no header — presumably an
# empty trailing column in the CSV)
rawCountyDeath <- readr::read_csv(deathLoc) 
## Warning: Missing column names filled in: 'X332' [332]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   X332 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
##  row  col           expected actual                                                                      file
## 3196 X332 1/0/T/F/TRUE/FALSE        './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv'
# Inspect the raw Virginia rows (still in wide one-column-per-date format)
rawCountyDeath %>%
    filter(State=="VA")
## # A tibble: 134 x 332
##    countyFIPS `County Name` State stateFIPS `1/22/20` `1/23/20` `1/24/20`
##         <dbl> <chr>         <chr>     <dbl>     <dbl>     <dbl>     <dbl>
##  1          0 Statewide Un~ VA           51         0         0         0
##  2      51001 Accomack Cou~ VA           51         0         0         0
##  3      51003 Albemarle Co~ VA           51         0         0         0
##  4      51005 Alleghany Co~ VA           51         0         0         0
##  5      51007 Amelia County VA           51         0         0         0
##  6      51009 Amherst Coun~ VA           51         0         0         0
##  7      51011 Appomattox C~ VA           51         0         0         0
##  8      51013 Arlington Co~ VA           51         0         0         0
##  9      51015 Augusta Coun~ VA           51         0         0         0
## 10      51017 Bath County   VA           51         0         0         0
## # ... with 124 more rows, and 325 more variables: `1/25/20` <dbl>,
## #   `1/26/20` <dbl>, `1/27/20` <dbl>, `1/28/20` <dbl>, `1/29/20` <dbl>,
## #   `1/30/20` <dbl>, `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>,
## #   `2/3/20` <dbl>, `2/4/20` <dbl>, `2/5/20` <dbl>, `2/6/20` <dbl>,
## #   `2/7/20` <dbl>, `2/8/20` <dbl>, `2/9/20` <dbl>, `2/10/20` <dbl>,
## #   `2/11/20` <dbl>, `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>,
## #   `2/15/20` <dbl>, `2/16/20` <dbl>, `2/17/20` <dbl>, `2/18/20` <dbl>,
## #   `2/19/20` <dbl>, `2/20/20` <dbl>, `2/21/20` <dbl>, `2/22/20` <dbl>,
## #   `2/23/20` <dbl>, `2/24/20` <dbl>, `2/25/20` <dbl>, `2/26/20` <dbl>,
## #   `2/27/20` <dbl>, `2/28/20` <dbl>, `2/29/20` <dbl>, `3/1/20` <dbl>,
## #   `3/2/20` <dbl>, `3/3/20` <dbl>, `3/4/20` <dbl>, `3/5/20` <dbl>,
## #   `3/6/20` <dbl>, `3/7/20` <dbl>, `3/8/20` <dbl>, `3/9/20` <dbl>,
## #   `3/10/20` <dbl>, `3/11/20` <dbl>, `3/12/20` <dbl>, `3/13/20` <dbl>,
## #   `3/14/20` <dbl>, `3/15/20` <dbl>, `3/16/20` <dbl>, `3/17/20` <dbl>,
## #   `3/18/20` <dbl>, `3/19/20` <dbl>, `3/20/20` <dbl>, `3/21/20` <dbl>,
## #   `3/22/20` <dbl>, `3/23/20` <dbl>, `3/24/20` <dbl>, `3/25/20` <dbl>,
## #   `3/26/20` <dbl>, `3/27/20` <dbl>, `3/28/20` <dbl>, `3/29/20` <dbl>,
## #   `3/30/20` <dbl>, `3/31/20` <dbl>, `4/1/20` <dbl>, `4/2/20` <dbl>,
## #   `4/3/20` <dbl>, `4/4/20` <dbl>, `4/5/20` <dbl>, `4/6/20` <dbl>,
## #   `4/7/20` <dbl>, `4/8/20` <dbl>, `4/9/20` <dbl>, `4/10/20` <dbl>,
## #   `4/11/20` <dbl>, `4/12/20` <dbl>, `4/13/20` <dbl>, `4/14/20` <dbl>,
## #   `4/15/20` <dbl>, `4/16/20` <dbl>, `4/17/20` <dbl>, `4/18/20` <dbl>,
## #   `4/19/20` <dbl>, `4/20/20` <dbl>, `4/21/20` <dbl>, `4/22/20` <dbl>,
## #   `4/23/20` <dbl>, `4/24/20` <dbl>, `4/25/20` <dbl>, `4/26/20` <dbl>,
## #   `4/27/20` <dbl>, `4/28/20` <dbl>, `4/29/20` <dbl>, `4/30/20` <dbl>,
## #   `5/1/20` <dbl>, `5/2/20` <dbl>, `5/3/20` <dbl>, ...
# Pivot VA deaths to long format, parse the m/d/yy date column names, and
# keep only the most recent date (latest cumulative deaths per county)
vaCountyDeath <- rawCountyDeath %>%
    filter(State=="VA") %>%
    select(-X332) %>%
    pivot_longer(-c(countyFIPS, `County Name`, State, stateFIPS), names_to="date", values_to="deaths") %>%
    mutate(date=lubridate::mdy(date)) %>%
    filter(date==max(date))
vaCountyDeath
## # A tibble: 134 x 6
##    countyFIPS `County Name`         State stateFIPS date       deaths
##         <dbl> <chr>                 <chr>     <dbl> <date>      <dbl>
##  1          0 Statewide Unallocated VA           51 2020-12-13      0
##  2      51001 Accomack County       VA           51 2020-12-13     21
##  3      51003 Albemarle County      VA           51 2020-12-13     28
##  4      51005 Alleghany County      VA           51 2020-12-13     18
##  5      51007 Amelia County         VA           51 2020-12-13      7
##  6      51009 Amherst County        VA           51 2020-12-13      7
##  7      51011 Appomattox County     VA           51 2020-12-13      6
##  8      51013 Arlington County      VA           51 2020-12-13    164
##  9      51015 Augusta County        VA           51 2020-12-13     16
## 10      51017 Bath County           VA           51 2020-12-13      0
## # ... with 124 more rows
# Join the deaths rows to the mismatched population rows on FIPS/state so
# the differing county-name spellings can be viewed side by side
vaCountyDeath %>%
    rename(deathName=`County Name`) %>%
    inner_join(select(mismatchVA, popName=`County Name`, everything()), by=c("countyFIPS", "State"))
## # A tibble: 14 x 8
##    countyFIPS deathName   State stateFIPS date       deaths popName   population
##         <dbl> <chr>       <chr>     <dbl> <date>      <dbl> <chr>          <dbl>
##  1          0 Statewide ~ VA           51 2020-12-13      0 Statewid~          0
##  2      51115 Mathews Co~ VA           51 2020-12-13      1 Matthews~       8834
##  3      51510 Alexandria~ VA           51 2020-12-13     82 Alexandr~     159428
##  4      51540 Charlottes~ VA           51 2020-12-13     31 Charlott~      47266
##  5      51550 Chesapeake~ VA           51 2020-12-13     88 Chesapea~     244835
##  6      51590 Danville c~ VA           51 2020-12-13     55 Danville~      40044
##  7      51630 Fredericks~ VA           51 2020-12-13      6 Frederic~      29036
##  8      51660 Harrisonbu~ VA           51 2020-12-13     40 Harrison~      53016
##  9      51683 Manassas c~ VA           51 2020-12-13     28 Manassas~      41085
## 10      51710 Norfolk ci~ VA           51 2020-12-13     99 Norfolk ~     242742
## 11      51740 Portsmouth~ VA           51 2020-12-13     73 Portsmou~      94398
## 12      51760 Richmond c~ VA           51 2020-12-13     86 Richmond~     230436
## 13      51800 Suffolk ci~ VA           51 2020-12-13     83 Suffolk ~      92108
## 14      51810 Virginia B~ VA           51 2020-12-13    125 Virginia~     449974

Some county and city names are spelled differently in the population file and the deaths file, leading to a disconnect. The process for reading the USA Facts data should be updated to check and correct for this.

A check is made to see if this issue impacts any other states in the USA Facts data:

# Repeat the long-format/latest-date extraction for all states, zero-padding
# countyFIPS to 5 characters so it can be joined as a character key
allCountyDeath <- rawCountyDeath %>%
    select(-X332) %>%
    pivot_longer(-c(countyFIPS, `County Name`, State, stateFIPS), names_to="date", values_to="deaths") %>%
    mutate(date=lubridate::mdy(date), 
           countyFIPS=stringr::str_pad(countyFIPS, side="left", width=5, pad="0")
           ) %>%
    filter(date==max(date))
allCountyDeath
## # A tibble: 3,196 x 6
##    countyFIPS `County Name`         State stateFIPS date       deaths
##    <chr>      <chr>                 <chr>     <dbl> <date>      <dbl>
##  1 00000      Statewide Unallocated AL            1 2020-12-13      0
##  2 01001      Autauga County        AL            1 2020-12-13     42
##  3 01003      Baldwin County        AL            1 2020-12-13    141
##  4 01005      Barbour County        AL            1 2020-12-13     30
##  5 01007      Bibb County           AL            1 2020-12-13     40
##  6 01009      Blount County         AL            1 2020-12-13     47
##  7 01011      Bullock County        AL            1 2020-12-13     20
##  8 01013      Butler County         AL            1 2020-12-13     44
##  9 01015      Calhoun County        AL            1 2020-12-13    129
## 10 01017      Chambers County       AL            1 2020-12-13     55
## # ... with 3,186 more rows
# Build a one-row-per-county summary from the previously processed burden data:
# maximum (i.e. cumulative-to-date) population, cases, and deaths, with
# countyFIPS zero-padded to 5 characters to match the raw USA Facts keys
countyAll <- cty_old_20201215$burdenData %>% 
    group_by(county, countyFIPS) %>% 
    summarize(pop=max(population, na.rm=TRUE), 
              cases=max(cumCases, na.rm=TRUE), 
              deaths=max(cumDeaths, na.rm=TRUE), 
              .groups="drop"
              ) %>%
    mutate(countyFIPS=stringr::str_pad(countyFIPS, width=5, side="left", pad="0"))
countyAll
## # A tibble: 3,127 x 5
##    county             countyFIPS    pop cases deaths
##    <chr>              <chr>       <dbl> <dbl>  <dbl>
##  1 Abbeville (SC)     45001       24527  1108     23
##  2 Acadia Parish (LA) 22001       62045  4522    127
##  3 Accomack (VA)      51001       32316  1494     21
##  4 Ada (ID)           16001      481587 31227    285
##  5 Adair (IA)         19001        7152   509     16
##  6 Adair (KY)         21001       19202   979     34
##  7 Adair (MO)         29001       25343  1297      3
##  8 Adair (OK)         40001       22194  1482     13
##  9 Adams (CO)         08001      517421 35145    425
## 10 Adams (IA)         19003        3602   212      2
## # ... with 3,117 more rows
# Records in the processed deaths file but not in the raw file
# (should be empty: every processed county must come from the raw download)
countyAll %>%
    anti_join(allCountyDeath, by="countyFIPS")
## # A tibble: 0 x 5
## # ... with 5 variables: county <chr>, countyFIPS <chr>, pop <dbl>, cases <dbl>,
## #   deaths <dbl>
# Records in the raw deaths file but not in the processed deaths file;
# show the non-zero rows in descending order of deaths
mismatch_1 <- allCountyDeath %>%
    anti_join(countyAll, by="countyFIPS")
mismatch_1 %>%
    filter(deaths > 0) %>%
    arrange(desc(deaths)) %>%
    as.data.frame()
##    countyFIPS                        County Name State stateFIPS       date
## 1       00000              Statewide Unallocated    KS        20 2020-12-13
## 2       00000              Statewide Unallocated    GA        13 2020-12-13
## 3       00000              Statewide Unallocated    NY        36 2020-12-13
## 4       51810                Virginia Beach city    VA        51 2020-12-13
## 5       00000              Statewide Unallocated    MI        26 2020-12-13
## 6       51710                       Norfolk city    VA        51 2020-12-13
## 7       00000              Statewide Unallocated    TN        47 2020-12-13
## 8       51550                    Chesapeake city    VA        51 2020-12-13
## 9       51760                      Richmond city    VA        51 2020-12-13
## 10      51800                       Suffolk city    VA        51 2020-12-13
## 11      51510                    Alexandria city    VA        51 2020-12-13
## 12      51740                    Portsmouth city    VA        51 2020-12-13
## 13      00000              Statewide Unallocated    MD        24 2020-12-13
## 14      08014                  Broomfield County    CO         8 2020-12-13
## 15      51590                      Danville city    VA        51 2020-12-13
## 16      00001 New York City Unallocated/Probable    NY        36 2020-12-13
## 17      51660                  Harrisonburg city    VA        51 2020-12-13
## 18      51540               Charlottesville city    VA        51 2020-12-13
## 19      51683                      Manassas city    VA        51 2020-12-13
## 20      27073               Lac qui Parle County    MN        27 2020-12-13
## 21      00000              Statewide Unallocated    MA        25 2020-12-13
## 22      51630                Fredericksburg city    VA        51 2020-12-13
## 23      00000              Statewide Unallocated    RI        44 2020-12-13
## 24      00000              Statewide Unallocated    UT        49 2020-12-13
## 25      00000              Statewide Unallocated    WA        53 2020-12-13
## 26      00000              Statewide Unallocated    HI        15 2020-12-13
## 27      00000              Statewide Unallocated    MT        30 2020-12-13
## 28      00000              Statewide Unallocated    CT         9 2020-12-13
## 29      00000              Statewide Unallocated    NE        31 2020-12-13
## 30      51115                     Mathews County    VA        51 2020-12-13
##    deaths
## 1     254
## 2     229
## 3     137
## 4     125
## 5     116
## 6      99
## 7      97
## 8      88
## 9      86
## 10     83
## 11     82
## 12     73
## 13     67
## 14     57
## 15     55
## 16     45
## 17     40
## 18     31
## 19     28
## 20      7
## 21      6
## 22      6
## 23      5
## 24      5
## 25      3
## 26      2
## 27      2
## 28      1
## 29      1
## 30      1
# Classify each mismatched death record as "unallocated" (statewide catch-all
# rows) or "specific" (a named county), then total deaths by state and type
mismatch_1 %>%
    mutate(isUnalloc=stringr::str_detect(string=`County Name`, pattern="nallocated"),
           unalloc=if_else(isUnalloc, "unallocated", "specific")
           ) %>%
    filter(deaths > 0) %>%
    pivot_wider(id_cols=c(State),
                names_from=unalloc,
                values_from=deaths,
                values_fn=sum,
                values_fill=0
                ) %>%
    arrange(desc(specific), desc(unallocated))
## # A tibble: 17 x 3
##    State specific unallocated
##    <chr>    <dbl>       <dbl>
##  1 VA         797           0
##  2 CO          57           0
##  3 MN           7           0
##  4 KS           0         254
##  5 GA           0         229
##  6 NY           0         182
##  7 MI           0         116
##  8 TN           0          97
##  9 MD           0          67
## 10 MA           0           6
## 11 RI           0           5
## 12 UT           0           5
## 13 WA           0           3
## 14 HI           0           2
## 15 MT           0           2
## 16 CT           0           1
## 17 NE           0           1

There are an additional 57 deaths in Colorado (Broomfield County) and 7 deaths in Minnesota (Lac qui Parle County) that are not counted.

There are also statewide unallocated deaths of at least 50 in Kansas (254), Georgia (229), New York (182), Michigan (116), Tennessee (97), and Maryland (67).

Among the states where the absolute value of the difference in deaths is at least 100:

  • New York (7340) - driven by differences in LTC tracking and probable cases in NYC as tracked by county (data from NYC) or by state (data from NYS), in an environment where a large city and state disagree on methodology
  • Georgia (1077) - about 20% explained by unallocated, remainder to be determined
  • Illinois (984) - to be explored
  • Virginia (792) - county name mismatches during merging
  • Iowa (444) - to be explored
  • Wisconsin (266) - to be explored?
  • Florida (260) - to be explored?
  • Washington (177) - to be explored?
  • Kansas (174) - unallocated explains gap
  • California (159) - to be explored?
  • Michigan (117) - unallocated explains gap

Next steps include updating the methodology for merging and exploring remaining disconnects for reported deaths in Georgia, Illinois, and Iowa.

The same process is run for cases:

# Read the raw USA Facts confirmed-cases download (wide: one column per date)
rawCountyCases <- readr::read_csv(caseLoc) 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
# Reshape the raw cases download from wide to long format, normalize
# countyFIPS to a 5-character zero-padded string, and keep only the most
# recent reporting date
allCountyCases <- rawCountyCases %>%
    pivot_longer(cols=-c(countyFIPS, `County Name`, State, stateFIPS),
                 names_to="date",
                 values_to="cases"
                 ) %>%
    mutate(countyFIPS=stringr::str_pad(countyFIPS, width=5, side="left", pad="0")) %>%
    mutate(date=lubridate::mdy(date)) %>%
    filter(date == max(date))
allCountyCases
## # A tibble: 3,195 x 6
##    countyFIPS `County Name`         State stateFIPS date       cases
##    <chr>      <chr>                 <chr>     <dbl> <date>     <dbl>
##  1 00000      Statewide Unallocated AL            1 2020-12-13     0
##  2 01001      Autauga County        AL            1 2020-12-13  3233
##  3 01003      Baldwin County        AL            1 2020-12-13 10489
##  4 01005      Barbour County        AL            1 2020-12-13  1264
##  5 01007      Bibb County           AL            1 2020-12-13  1398
##  6 01009      Blount County         AL            1 2020-12-13  3663
##  7 01011      Bullock County        AL            1 2020-12-13   723
##  8 01013      Butler County         AL            1 2020-12-13  1289
##  9 01015      Calhoun County        AL            1 2020-12-13  7658
## 10 01017      Chambers County       AL            1 2020-12-13  1982
## # ... with 3,185 more rows
# Records in the processed cases file but not in the raw file
# (should be empty: every processed county must come from the raw download)
countyAll %>%
    anti_join(allCountyCases, by="countyFIPS")
## # A tibble: 0 x 5
## # ... with 5 variables: county <chr>, countyFIPS <chr>, pop <dbl>, cases <dbl>,
## #   deaths <dbl>
# Records in the raw cases file but not in the processed cases file;
# show the non-zero rows in descending order of cases
mismatch_1_cases <- allCountyCases %>%
    anti_join(countyAll, by="countyFIPS")
mismatch_1_cases %>%
    filter(cases > 0) %>%
    arrange(desc(cases)) %>%
    as.data.frame()
##    countyFIPS                County Name State stateFIPS       date cases
## 1       00000      Statewide Unallocated    GA        13 2020-12-13 21332
## 2       00000      Statewide Unallocated    TN        47 2020-12-13 19800
## 3       00000      Statewide Unallocated    MI        26 2020-12-13 19025
## 4       51810        Virginia Beach City    VA        51 2020-12-13 13063
## 5       51760              Richmond City    VA        51 2020-12-13  7636
## 6       51550            Chesapeake City    VA        51 2020-12-13  7540
## 7       51710               Norfolk City    VA        51 2020-12-13  7368
## 8       51510            Alexandria City    VA        51 2020-12-13  6215
## 9       51660          Harrisonburg City    VA        51 2020-12-13  3804
## 10      51740            Portsmouth City    VA        51 2020-12-13  3593
## 11      51800               Suffolk City    VA        51 2020-12-13  3211
## 12      00000      Statewide Unallocated    FL        12 2020-12-13  3130
## 13      00000      Statewide Unallocated    AR         5 2020-12-13  3096
## 14      00000      Statewide Unallocated    NM        35 2020-12-13  2995
## 15      00000      Statewide Unallocated    RI        44 2020-12-13  2939
## 16      51683              Manassas City    VA        51 2020-12-13  2572
## 17      08014 Broomfield County and City    CO         8 2020-12-13  2446
## 18      51540       Charlottesville City    VA        51 2020-12-13  2008
## 19      51590              Danville City    VA        51 2020-12-13  1966
## 20      00000      Statewide Unallocated    MA        25 2020-12-13  1754
## 21      00000      Statewide Unallocated    WA        53 2020-12-13  1394
## 22      00000      Statewide Unallocated    CT         9 2020-12-13  1055
## 23      00000      Statewide Unallocated    NJ        34 2020-12-13  1019
## 24      51630        Fredericksburg City    VA        51 2020-12-13   823
## 25      00000      Statewide Unallocated    UT        49 2020-12-13   795
## 26      00000      Statewide Unallocated    IA        19 2020-12-13   608
## 27      00000      Statewide Unallocated    NH        33 2020-12-13   570
## 28      27073       Lac Qui Parle County    MN        27 2020-12-13   519
## 29      00000      Statewide Unallocated    MN        27 2020-12-13   447
## 30      00000      Statewide Unallocated    LA        22 2020-12-13   416
## 31      00000      Statewide Unallocated    HI        15 2020-12-13   286
## 32      00000      Statewide Unallocated    IL        17 2020-12-13   247
## 33      51115            Matthews County    VA        51 2020-12-13   198
## 34      00000      Statewide Unallocated    OK        40 2020-12-13   114
## 35      00000      Statewide Unallocated    DE        10 2020-12-13    96
## 36      00000      Statewide Unallocated    NY        36 2020-12-13    69
## 37      00000      Statewide Unallocated    NC        37 2020-12-13    29
## 38      00000      Statewide Unallocated    CO         8 2020-12-13    25
## 39      00000      Statewide Unallocated    AK         2 2020-12-13    23
## 40      06000 Grand Princess Cruise Ship    CA         6 2020-12-13    21
## 41      00000      Statewide Unallocated    VT        50 2020-12-13     8
## 42      00000      Statewide Unallocated    ME        23 2020-12-13     6
## 43      00000      Statewide Unallocated    ID        16 2020-12-13     3
## 44      00000      Statewide Unallocated    NE        31 2020-12-13     2
## 45      00000      Statewide Unallocated    KY        21 2020-12-13     1
# Classify each mismatched case record as "unallocated" (statewide catch-all
# rows) or "specific" (a named county), then total cases by state and type
mismatch_1_cases %>%
    mutate(isUnalloc=stringr::str_detect(string=`County Name`, pattern="nallocated"),
           unalloc=if_else(isUnalloc, "unallocated", "specific")
           ) %>%
    filter(cases > 0) %>%
    pivot_wider(id_cols=c(State),
                names_from=unalloc,
                values_from=cases,
                values_fn=sum,
                values_fill=0
                ) %>%
    arrange(desc(specific), desc(unallocated))
## # A tibble: 31 x 3
##    State unallocated specific
##    <chr>       <dbl>    <dbl>
##  1 VA              0    59997
##  2 CO             25     2446
##  3 MN            447      519
##  4 CA              0       21
##  5 GA          21332        0
##  6 TN          19800        0
##  7 MI          19025        0
##  8 FL           3130        0
##  9 AR           3096        0
## 10 NM           2995        0
## # ... with 21 more rows

The issues appear to be broadly the same, with significant county name mismatches in Virginia, one mismatch in each of Colorado and Minnesota, and a number of statewide unallocated cases. There are also 21 cases on a cruise ship in California; these will have no impact on the analysis.

An attempt is made to update the readUSAFacts() function to account for mismatches in county demographics:

# Function to read and convert raw data from USA Facts
# Reads the wide-format case and death downloads, validates that the two
# files agree on county identity and population, joins them into a single
# long-format daily burden file, optionally attaches state or county
# clusters, and optionally plots comparisons against a previous run
readUSAFacts <- function(caseFile, 
                         deathFile, 
                         countyPopFile=filter(pop_usafacts, countyFIPS!=0),
                         oldFile=NULL,
                         showBurdenMinPop=NULL,
                         maxDate=NULL,
                         stateClusters=NULL, 
                         countyClusters=NULL, 
                         glimpseRaw=TRUE
                         ) {
    
    # FUNCTION ARGUMENTS:
    # caseFile: the location of the downloaded cases dataset
    # deathFile: the location of the downloaded deaths dataset
    # countyPopFile: the county population data (default: pop_usafacts minus the countyFIPS==0 statewide rows)
    # oldFile: a file for comparing control totals against (NULL means do not compare)
    # showBurdenMinPop: minimum population for showing burden by cluster (NULL means skip plot)
    # maxDate: the date to use for the burden by cluster plot (ignored unless showBurdenMinPop is not NULL)
    # stateClusters: a field 'cluster' will be created from state using this named vector
    #                NULL means do not use state for finding clusters
    # countyClusters: a field 'cluster' will be created from countyFIPS using this named vector
    #                 NULL means do not use county for finding clusters
    # If both stateClusters and countyClusters are passed, only stateClusters will be used
    # glimpseRaw: boolean, whether to show a glimpse of the raw data when initially read in
    # RETURNS: dfBurden, a tibble of daily cumulative burden by county
    
    # Read cases file
    cnvCases <- helperReadConvert(caseFile, 
                                  valueName="cumCases", 
                                  glimpseRaw=glimpseRaw, 
                                  countyPopFile=countyPopFile
    )
    
    # Read deaths file
    cnvDeaths <- helperReadConvert(deathFile, 
                                   valueName="cumDeaths", 
                                   glimpseRaw=glimpseRaw, 
                                   countyPopFile=countyPopFile
    )

    # Check that county name and population are consistent by county FIPS in each file
    # (population sd should be 0 and each county should carry exactly one name)
    caseCheck <- cnvCases %>%
        group_by(countyFIPS, stateFIPS) %>%
        summarize(pop_mean_case=mean(population), 
                  pop_sd_case=sd(population), 
                  nNamesCase=n_distinct(county), 
                  nameCase=first(county), 
                  .groups="drop"
                  )
    deathCheck <- cnvDeaths %>%
        group_by(countyFIPS, stateFIPS) %>%
        summarize(pop_mean_death=mean(population), 
                  pop_sd_death=sd(population), 
                  nNamesDeath=n_distinct(county), 
                  nameDeath=first(county), 
                  .groups="drop"
                  )

    # Full join so a county present in only one source shows up with NA columns
    dcMismatch <- deathCheck %>%
        full_join(caseCheck, by=c("countyFIPS", "stateFIPS"))
    
    # Abort if any counties are in only one source (any NA implies a one-sided county)
    nNA <- sum(is.na(dcMismatch))
    if (nNA > 0) {
        cat("\nDifferent combinations of countyFIPS-stateFIPS in cases and deaths data\n")
        dcMismatch %>% filter(!complete.cases(.)) %>% print()
        stop("\nFix and re-run\n")
    }
    
    # Abort if any counties have mismatched populations (within source or across source) or 2+ names per source
    badMismatch <- dcMismatch %>%
        filter(pop_mean_death != pop_mean_case | 
                   pop_sd_death != 0 | 
                   pop_sd_case != 0 | 
                   nNamesDeath != 1 | 
                   nNamesCase != 1
               )
    if (nrow(badMismatch) > 0) {
        cat("\nCase and death data have fatal mismatch on population and/or names\n")
        print(badMismatch)
        stop("\nFix and re-run\n")
    }

    # Flag (but do not abort) if any counties have different names in the two files
    cat("\nCounty name mismatches (case name will be used)\n")
    dcMismatch %>%
        filter(nameCase != nameDeath) %>%
        select(countyFIPS, stateFIPS, nameCase, nameDeath, pop_mean_case) %>%
        print()
    
    # Flag any records in one file but not in the other
    cat("\nIn cases but not in deaths\n")
    cnvCases %>%
        anti_join(cnvDeaths, by=c("countyFIPS", "stateFIPS", "date")) %>%
        print()
    cat("\nIn deaths but not in cases\n")
    cnvDeaths %>%
        anti_join(cnvCases, by=c("countyFIPS", "stateFIPS", "date")) %>%
        print()
    
    # File origin is a single row by county with dates as columns, so inner_join should be OK
    # From cnvCases keep countyFIPS-stateFIPS-date-county-state-population-cumCases-cumCasesPer
    # From cnvDeaths keep countyFIPS-stateFIPS-date-cumDeaths-cumDeathsPer
    # Inner join using keys countyFIPS-stateFIPS-date
    # Also, add the state segments as 'cluster' if requested
    # NOTE(review): the death-rate column is named cumDeathPer (not cumDeathsPer);
    # kept as-is since downstream code may depend on that name
    dfBurden <- cnvCases %>%
        select(countyFIPS, stateFIPS, date, county, state, population, cumCases, cumCasesPer=burden) %>%
        inner_join(select(cnvDeaths, countyFIPS, stateFIPS, date, cumDeaths, cumDeathPer=burden), 
                   by=c("countyFIPS", "stateFIPS", "date")
                   ) %>%
        mutate(cluster=if(is.null(stateClusters)) NA else stateClusters[state])
    
    # Attach county clusters if requested and if state clusters were not attached
    # County cluster names converted to 5-digit character to match countyFIPS keys
    str_pad5 <- function(x) stringr::str_pad(x, width=5, side="left", pad="0")
    if (is.null(stateClusters) && !is.null(countyClusters)) {
        names(countyClusters) <- str_pad5(as.character(names(countyClusters)))
        dfBurden <- dfBurden %>%
            mutate(cluster=countyClusters[str_pad5(countyFIPS)])
    }
    
    # Compare national control totals against an old file, if requested
    # (.id yields character "1"/"2"; comparison to numeric works via coercion)
    if (!is.null(oldFile)) {
        p1 <- bind_rows(oldFile, dfBurden, .id="source") %>%
            mutate(source=factor(case_when(source==1 ~ "Previous", source==2 ~ "New", TRUE ~ "Unknown"), 
                                 levels=c("New", "Previous", "Unknown")
            )
            ) %>%
            group_by(source, date) %>%
            summarize(cumDeaths=sum(cumDeaths), cumCases=sum(cumCases)) %>%
            pivot_longer(-c(source, date)) %>%
            ggplot(aes(x=date, y=value/1000, group=source, color=source)) + 
            geom_line() + 
            facet_wrap(~c("cumCases"="Cases", "cumDeaths"="Deaths")[name], scales="free_y") + 
            scale_x_date(date_breaks="1 months", date_labels="%m") + 
            labs(y="Burden (000s)", title="US National Coronavirus Burden by Source")
        print(p1)
    }
    
    # Plot burden by cluster, if requested
    if (!is.null(showBurdenMinPop)) {
        plotBurdenData(dfBurden, maxDate=maxDate, minPop=showBurdenMinPop)
    }
    
    # Return the burdens file
    dfBurden
    
}

The updated function is tested against the existing data:

# Locations for the population, case, and death file
# (cases/deaths are the 2020-12-15 USA Facts downloads)
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201215.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv"

# Run old segments against new data
# dlCaseDeath: only re-download if either local file is missing
# NOTE(review): file.exists() returns a scalar here, so & works; && would be
# the more idiomatic choice for a scalar condition
cty_old_20201215_v002 <- readRunUSAFacts(maxDate="2020-12-13", 
                                         popLoc=popLoc, 
                                         caseLoc=caseLoc, 
                                         deathLoc=deathLoc, 
                                         dlCaseDeath=!(file.exists(caseLoc) & file.exists(deathLoc)),
                                         oldFile=readFromRDS("cty_20201026")$dfBurden, 
                                         existingCountyClusters=readFromRDS("cty_new_20201203")$clustVec
                                         )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,044,765
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: Missing column names filled in: 'X332' [332]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   X332 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
##  row  col           expected actual                                                                      file
## 3196 X332 1/0/T/F/TRUE/FALSE        './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv'
## Warning: Problem with `mutate()` input `date`.
## i  3196 failed to parse.
## i Input `date` is `lubridate::mdy(date)`.
## Warning: 3196 failed to parse.
## Rows: 1,048,288
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## # A tibble: 1,027,434 x 11
##    countyFIPS countyName.x state stateFIPS date       cumCases countyName.y
##         <dbl> <chr>        <chr>     <dbl> <date>        <dbl> <chr>       
##  1       1001 Autauga Cou~ AL            1 2020-01-22        0 Autauga Cou~
##  2       1003 Baldwin Cou~ AL            1 2020-01-22        0 Baldwin Cou~
##  3       1005 Barbour Cou~ AL            1 2020-01-22        0 Barbour Cou~
##  4       1007 Bibb County  AL            1 2020-01-22        0 Bibb County 
##  5       1009 Blount Coun~ AL            1 2020-01-22        0 Blount Coun~
##  6       1011 Bullock Cou~ AL            1 2020-01-22        0 Bullock Cou~
##  7       1013 Butler Coun~ AL            1 2020-01-22        0 Butler Coun~
##  8       1015 Calhoun Cou~ AL            1 2020-01-22        0 Calhoun Cou~
##  9       1017 Chambers Co~ AL            1 2020-01-22        0 Chambers Co~
## 10       1019 Cherokee Co~ AL            1 2020-01-22        0 Cherokee Co~
## # ... with 1,027,424 more rows, and 4 more variables: population <dbl>,
## #   burden <dbl>, county <chr>, bold <dbl>
## # A tibble: 1,030,576 x 11
##    countyFIPS countyName.x state stateFIPS date       cumDeaths countyName.y
##         <dbl> <chr>        <chr>     <dbl> <date>         <dbl> <chr>       
##  1       1001 Autauga Cou~ AL            1 2020-01-22         0 Autauga Cou~
##  2       1003 Baldwin Cou~ AL            1 2020-01-22         0 Baldwin Cou~
##  3       1005 Barbour Cou~ AL            1 2020-01-22         0 Barbour Cou~
##  4       1007 Bibb County  AL            1 2020-01-22         0 Bibb County 
##  5       1009 Blount Coun~ AL            1 2020-01-22         0 Blount Coun~
##  6       1011 Bullock Cou~ AL            1 2020-01-22         0 Bullock Cou~
##  7       1013 Butler Coun~ AL            1 2020-01-22         0 Butler Coun~
##  8       1015 Calhoun Cou~ AL            1 2020-01-22         0 Calhoun Cou~
##  9       1017 Chambers Co~ AL            1 2020-01-22         0 Chambers Co~
## 10       1019 Cherokee Co~ AL            1 2020-01-22         0 Cherokee Co~
## # ... with 1,030,566 more rows, and 4 more variables: population <dbl>,
## #   burden <dbl>, county <chr>, bold <dbl>
## # A tibble: 3,142 x 6
##    countyFIPS stateFIPS pop_mean_case pop_sd_case nNamesCase nameCase     
##         <dbl>     <dbl>         <dbl>       <dbl>      <int> <chr>        
##  1       1001         1         55869           0          1 Autauga (AL) 
##  2       1003         1        223234           0          1 Baldwin (AL) 
##  3       1005         1         24686           0          1 Barbour (AL) 
##  4       1007         1         22394           0          1 Bibb (AL)    
##  5       1009         1         57826           0          1 Blount (AL)  
##  6       1011         1         10101           0          1 Bullock (AL) 
##  7       1013         1         19448           0          1 Butler (AL)  
##  8       1015         1        113605           0          1 Calhoun (AL) 
##  9       1017         1         33254           0          1 Chambers (AL)
## 10       1019         1         26196           0          1 Cherokee (AL)
## # ... with 3,132 more rows
## # A tibble: 3,142 x 6
##    countyFIPS stateFIPS pop_mean_death pop_sd_death nNamesDeath nameDeath    
##         <dbl>     <dbl>          <dbl>        <dbl>       <int> <chr>        
##  1       1001         1          55869            0           1 Autauga (AL) 
##  2       1003         1         223234            0           1 Baldwin (AL) 
##  3       1005         1          24686            0           1 Barbour (AL) 
##  4       1007         1          22394            0           1 Bibb (AL)    
##  5       1009         1          57826            0           1 Blount (AL)  
##  6       1011         1          10101            0           1 Bullock (AL) 
##  7       1013         1          19448            0           1 Butler (AL)  
##  8       1015         1         113605            0           1 Calhoun (AL) 
##  9       1017         1          33254            0           1 Chambers (AL)
## 10       1019         1          26196            0           1 Cherokee (AL)
## # ... with 3,132 more rows
## 
## County name mismatches (case name will be used)
## # A tibble: 15 x 5
##    countyFIPS stateFIPS nameCase              nameDeath            pop_mean_case
##         <dbl>     <dbl> <chr>                 <chr>                        <dbl>
##  1       8014         8 Broomfield and City ~ Broomfield (CO)              70465
##  2      27073        27 Lac Qui Parle (MN)    Lac qui Parle (MN)            6623
##  3      51115        51 Matthews (VA)         Mathews (VA)                  8834
##  4      51510        51 Alexandria City (VA)  Alexandria city (VA)        159428
##  5      51540        51 Charlottesville City~ Charlottesville cit~         47266
##  6      51550        51 Chesapeake City (VA)  Chesapeake city (VA)        244835
##  7      51590        51 Danville City (VA)    Danville city (VA)           40044
##  8      51630        51 Fredericksburg City ~ Fredericksburg city~         29036
##  9      51660        51 Harrisonburg City (V~ Harrisonburg city (~         53016
## 10      51683        51 Manassas City (VA)    Manassas city (VA)           41085
## 11      51710        51 Norfolk City (VA)     Norfolk city (VA)           242742
## 12      51740        51 Portsmouth City (VA)  Portsmouth city (VA)         94398
## 13      51760        51 Richmond City (VA)    Richmond city (VA)          230436
## 14      51800        51 Suffolk City (VA)     Suffolk city (VA)            92108
## 15      51810        51 Virginia Beach City ~ Virginia Beach city~        449974
## 
## In cases but not in deaths
## # A tibble: 0 x 11
## # ... with 11 variables: countyFIPS <dbl>, countyName.x <chr>, state <chr>,
## #   stateFIPS <dbl>, date <date>, cumCases <dbl>, countyName.y <chr>,
## #   population <dbl>, burden <dbl>, county <chr>, bold <dbl>
## 
## In deaths but not in cases
## # A tibble: 3,142 x 11
##    countyFIPS countyName.x state stateFIPS date       cumDeaths countyName.y
##         <dbl> <chr>        <chr>     <dbl> <date>         <dbl> <chr>       
##  1       1001 Autauga Cou~ AL            1 NA                NA Autauga Cou~
##  2       1003 Baldwin Cou~ AL            1 NA                NA Baldwin Cou~
##  3       1005 Barbour Cou~ AL            1 NA                NA Barbour Cou~
##  4       1007 Bibb County  AL            1 NA                NA Bibb County 
##  5       1009 Blount Coun~ AL            1 NA                NA Blount Coun~
##  6       1011 Bullock Cou~ AL            1 NA                NA Bullock Cou~
##  7       1013 Butler Coun~ AL            1 NA                NA Butler Coun~
##  8       1015 Calhoun Cou~ AL            1 NA                NA Calhoun Cou~
##  9       1017 Chambers Co~ AL            1 NA                NA Chambers Co~
## 10       1019 Cherokee Co~ AL            1 NA                NA Cherokee Co~
## # ... with 3,132 more rows, and 4 more variables: population <dbl>,
## #   burden <dbl>, county <chr>, bold <dbl>
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0      0.00249                0               0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-14 through current
## 
## Recency is defined as 2020-11-14 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

An updated summary of the deaths impact is performed:

# Snapshot of the USA Facts cumulative county-level deaths file (downloaded 2020-12-15)
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv"
# The parsing warning below refers to a trailing empty column (auto-named X332)
# created by a stray comma in the raw CSV; it is dropped in the next step
rawCountyDeath <- readr::read_csv(deathLoc) 
## Warning: Missing column names filled in: 'X332' [332]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   X332 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
##  row  col           expected actual                                                                      file
## 3196 X332 1/0/T/F/TRUE/FALSE        './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv'
# Reshape the wide (one column per date) deaths file into long records and
# keep only the most recent date, i.e. cumulative deaths as of the snapshot
allCountyDeath <- rawCountyDeath %>%
    # Drop the junk column produced by the trailing comma in the raw header
    select(-X332) %>%
    pivot_longer(cols=-c(countyFIPS, `County Name`, State, stateFIPS),
                 names_to="date",
                 values_to="deaths"
                 ) %>%
    mutate(date=lubridate::mdy(date),
           # Zero-pad FIPS to five characters so joins against processed data work
           countyFIPS=stringr::str_pad(countyFIPS, width=5, side="left", pad="0")
           ) %>%
    filter(date == max(date))
allCountyDeath
## # A tibble: 3,196 x 6
##    countyFIPS `County Name`         State stateFIPS date       deaths
##    <chr>      <chr>                 <chr>     <dbl> <date>      <dbl>
##  1 00000      Statewide Unallocated AL            1 2020-12-13      0
##  2 01001      Autauga County        AL            1 2020-12-13     42
##  3 01003      Baldwin County        AL            1 2020-12-13    141
##  4 01005      Barbour County        AL            1 2020-12-13     30
##  5 01007      Bibb County           AL            1 2020-12-13     40
##  6 01009      Blount County         AL            1 2020-12-13     47
##  7 01011      Bullock County        AL            1 2020-12-13     20
##  8 01013      Butler County         AL            1 2020-12-13     44
##  9 01015      Calhoun County        AL            1 2020-12-13    129
## 10 01017      Chambers County       AL            1 2020-12-13     55
## # ... with 3,186 more rows
# Collapse the previously-processed burden data to one row per county, keeping
# population and the final (maximum) cumulative case and death counts
countyAll <- cty_old_20201215_v002$burdenData %>% 
    group_by(county, countyFIPS) %>% 
    summarise(pop=max(population, na.rm=TRUE),
              cases=max(cumCases, na.rm=TRUE),
              deaths=max(cumDeaths, na.rm=TRUE),
              .groups="drop"
              ) %>%
    # Zero-pad FIPS to five characters to match allCountyDeath's key format
    mutate(countyFIPS=stringr::str_pad(countyFIPS, side="left", width=5, pad="0"))
countyAll
## # A tibble: 3,142 x 5
##    county             countyFIPS    pop cases deaths
##    <chr>              <chr>       <dbl> <dbl>  <dbl>
##  1 Abbeville (SC)     45001       24527  1108     23
##  2 Acadia Parish (LA) 22001       62045  4522    127
##  3 Accomack (VA)      51001       32316  1494     21
##  4 Ada (ID)           16001      481587 31227    285
##  5 Adair (IA)         19001        7152   509     16
##  6 Adair (KY)         21001       19202   979     34
##  7 Adair (MO)         29001       25343  1297      3
##  8 Adair (OK)         40001       22194  1482     13
##  9 Adams (CO)         08001      517421 35145    425
## 10 Adams (IA)         19003        3602   212      2
## # ... with 3,132 more rows
# Records in the processed deaths file but not in the raw file (should be empty)
anti_join(countyAll, allCountyDeath, by="countyFIPS")
## # A tibble: 0 x 5
## # ... with 5 variables: county <chr>, countyFIPS <chr>, pop <dbl>, cases <dbl>,
## #   deaths <dbl>
# Records in the raw deaths file but not in the processed deaths file
mismatch_1 <- anti_join(allCountyDeath, countyAll, by="countyFIPS")
# Display only the mismatched records with at least one death, largest first
mismatch_1 %>%
    filter(deaths > 0) %>%
    arrange(desc(deaths)) %>%
    as.data.frame()
##    countyFIPS                        County Name State stateFIPS       date
## 1       00000              Statewide Unallocated    KS        20 2020-12-13
## 2       00000              Statewide Unallocated    GA        13 2020-12-13
## 3       00000              Statewide Unallocated    NY        36 2020-12-13
## 4       00000              Statewide Unallocated    MI        26 2020-12-13
## 5       00000              Statewide Unallocated    TN        47 2020-12-13
## 6       00000              Statewide Unallocated    MD        24 2020-12-13
## 7       00001 New York City Unallocated/Probable    NY        36 2020-12-13
## 8       00000              Statewide Unallocated    MA        25 2020-12-13
## 9       00000              Statewide Unallocated    RI        44 2020-12-13
## 10      00000              Statewide Unallocated    UT        49 2020-12-13
## 11      00000              Statewide Unallocated    WA        53 2020-12-13
## 12      00000              Statewide Unallocated    HI        15 2020-12-13
## 13      00000              Statewide Unallocated    MT        30 2020-12-13
## 14      00000              Statewide Unallocated    CT         9 2020-12-13
## 15      00000              Statewide Unallocated    NE        31 2020-12-13
##    deaths
## 1     254
## 2     229
## 3     137
## 4     116
## 5      97
## 6      67
## 7      45
## 8       6
## 9       5
## 10      5
## 11      3
## 12      2
## 13      2
## 14      1
## 15      1
# Summarize the mismatched deaths by state, split into unallocated vs.
# county-specific pools; "nallocated" matches both "Unallocated" and "unallocated"
mismatch_1 %>%
    mutate(unalloc=ifelse(test=stringr::str_detect(string=`County Name`,
                                                   pattern="nallocated"),
                          yes="unallocated",
                          no="specific"
                          )
           ) %>%
    filter(deaths > 0) %>%
    # One row per state; sum deaths within each flag and fill absent cells with 0
    pivot_wider(id_cols=State,
                names_from=unalloc,
                values_from=deaths,
                values_fn=sum,
                values_fill=0
                ) %>%
    arrange(desc(unallocated))
## # A tibble: 14 x 2
##    State unallocated
##    <chr>       <dbl>
##  1 KS            254
##  2 GA            229
##  3 NY            182
##  4 MI            116
##  5 TN             97
##  6 MD             67
##  7 MA              6
##  8 RI              5
##  9 UT              5
## 10 WA              3
## 11 HI              2
## 12 MT              2
## 13 CT              1
## 14 NE              1

There is no longer a disconnect based on mismatched county names. The missing deaths come entirely from the unallocated pools — the statewide unallocated records plus New York City's unallocated/probable record.

New data are downloaded from COVID Tracking Project, with existing segments used:

# Use existing segments with updated data
# Re-run the COVID Tracking Project pipeline on data through Dec 23, 2020.
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201224.csv"
old_hier5_201224 <- readRunCOVIDTrackingProject(thruLabel="Dec 23, 2020", 
                                                # Skip the download (NULL) when the file is already cached locally
                                                downloadTo=if(file.exists(locDownload)) NULL else locDownload,
                                                readFrom=locDownload, 
                                                # Reference raw data from the Oct 25 run for the comparison report
                                                compareFile=readFromRDS("test_hier5_201025")$dfRaw,
                                                # Reuse cluster (segment) assignments from the Nov 30 run
                                                useClusters=readFromRDS("test_hier5_201130")$useClusters
                                                )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         18238749        317513              13690224
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-12-23 2020-12-22 2020-12-21 2020-12-20 2020-12-19 2020-12-18 2020-12-17 2020-12-16 2020-12-15 2020-12-14 2020-12-13 2020-12-12 2020-12-11 2020-12-10 2020-12-09 2020-12-08 2020-12-07 2020-12-06 2020-12-05 2020-12-04 2020-12-03 2020-12-02 2020-12-01 2020-11-30 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25 2020-01-21 2020-01-20 2020-01-19 2020-01-18 2020-01-17 2020-01-16 2020-01-15 2020-01-14 2020-01-13
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##           date                  name newValue oldValue
## 1   2020-02-29      positiveIncrease        3       18
## 2   2020-03-01      positiveIncrease        8       16
## 3   2020-03-02      positiveIncrease       30       44
## 4   2020-03-03      positiveIncrease       42       48
## 5   2020-03-05      positiveIncrease       59      103
## 6   2020-03-06      positiveIncrease      127      109
## 7   2020-03-07      positiveIncrease      142      176
## 8   2020-03-08      positiveIncrease      173      198
## 9   2020-03-09      positiveIncrease      265      292
## 10  2020-03-10      positiveIncrease      392      387
## 11  2020-03-11      positiveIncrease      425      509
## 12  2020-03-13      positiveIncrease      849     1072
## 13  2020-03-14      positiveIncrease      993      924
## 14  2020-03-15      positiveIncrease     1323     1291
## 15  2020-03-16      positiveIncrease     1703     1739
## 16  2020-03-17      positiveIncrease     2093     2588
## 17  2020-03-18      positiveIncrease     3375     3089
## 18  2020-03-19      positiveIncrease     4601     4651
## 19  2020-03-21      positiveIncrease     6906     6793
## 20  2020-03-21 hospitalizedCurrently     1492     1436
## 21  2020-03-22      positiveIncrease     9270     9125
## 22  2020-03-23      positiveIncrease    11175    11439
## 23  2020-03-23 hospitalizedCurrently     2812     2770
## 24  2020-03-25      positiveIncrease    12613    12908
## 25  2020-03-25 hospitalizedCurrently     5140     5062
## 26  2020-03-28         deathIncrease      551      544
## 27  2020-03-29      positiveIncrease    19668    19348
## 28  2020-03-29         deathIncrease      504      515
## 29  2020-03-30      positiveIncrease    21191    22042
## 30  2020-03-31         deathIncrease      907      890
## 31  2020-04-01      positiveIncrease    26233    25791
## 32  2020-04-05      positiveIncrease    25879    25500
## 33  2020-04-06      positiveIncrease    28238    29002
## 34  2020-04-07      positiveIncrease    30461    30885
## 35  2020-04-09      positiveIncrease    35140    34503
## 36  2020-04-10      positiveIncrease    33526    34380
## 37  2020-04-10         deathIncrease     2072     2108
## 38  2020-04-11      positiveIncrease    31313    30501
## 39  2020-04-11         deathIncrease     2079     2054
## 40  2020-04-12      positiveIncrease    28221    27784
## 41  2020-04-13      positiveIncrease    24182    25195
## 42  2020-04-15      positiveIncrease    29902    30307
## 43  2020-04-16      positiveIncrease    31570    30978
## 44  2020-04-21      positiveIncrease    26039    26367
## 45  2020-04-23         deathIncrease     1814     1791
## 46  2020-04-24         deathIncrease     1972     1895
## 47  2020-04-25         deathIncrease     1627     1748
## 48  2020-04-27      positiveIncrease    22392    22708
## 49  2020-04-27         deathIncrease     1287     1270
## 50  2020-04-29         deathIncrease     2685     2713
## 51  2020-05-01         deathIncrease     1808     1779
## 52  2020-05-02         deathIncrease     1531     1562
## 53  2020-05-04      positiveIncrease    22228    22649
## 54  2020-05-05         deathIncrease     2494     2452
## 55  2020-05-06         deathIncrease     1916     1948
## 56  2020-05-07      positiveIncrease    27221    27537
## 57  2020-05-11      positiveIncrease    18128    18377
## 58  2020-05-12      positiveIncrease    22369    22890
## 59  2020-05-12         deathIncrease     1506     1486
## 60  2020-05-13      positiveIncrease    21614    21285
## 61  2020-05-13         deathIncrease     1734     1704
## 62  2020-05-14         deathIncrease     1852     1879
## 63  2020-05-15      positiveIncrease    25471    24685
## 64  2020-05-15         deathIncrease     1535     1507
## 65  2020-05-16      positiveIncrease    23739    24702
## 66  2020-05-16         deathIncrease     1237      987
## 67  2020-05-17      positiveIncrease    20370    20009
## 68  2020-05-17         deathIncrease      873      849
## 69  2020-05-18      positiveIncrease    20643    21028
## 70  2020-05-19      positiveIncrease    20648    20897
## 71  2020-05-21         deathIncrease     1377     1394
## 72  2020-05-22      positiveIncrease    24128    24433
## 73  2020-05-22         deathIncrease     1291     1341
## 74  2020-05-23      positiveIncrease    22513    21531
## 75  2020-05-23         deathIncrease     1038     1063
## 76  2020-05-24      positiveIncrease    19072    20072
## 77  2020-05-24         deathIncrease      689      680
## 78  2020-05-26         deathIncrease      665      645
## 79  2020-05-27      positiveIncrease    19209    19447
## 80  2020-05-27         deathIncrease     1335     1321
## 81  2020-05-29         deathIncrease     1171     1184
## 82  2020-06-01      positiveIncrease    20101    20485
## 83  2020-06-01         deathIncrease      680      668
## 84  2020-06-02      positiveIncrease    19840    20109
## 85  2020-06-02         deathIncrease      973      962
## 86  2020-06-03      positiveIncrease    20182    20390
## 87  2020-06-03         deathIncrease      974      993
## 88  2020-06-04      positiveIncrease    20470    20886
## 89  2020-06-04         deathIncrease      881      893
## 90  2020-06-05      positiveIncrease    23055    23394
## 91  2020-06-05         deathIncrease      837      826
## 92  2020-06-06      positiveIncrease    22723    23064
## 93  2020-06-06         deathIncrease      714      728
## 94  2020-06-07      positiveIncrease    19101    18740
## 95  2020-06-08      positiveIncrease    16883    17209
## 96  2020-06-08         deathIncrease      674      661
## 97  2020-06-09      positiveIncrease    16853    17312
## 98  2020-06-09         deathIncrease      891      902
## 99  2020-06-12      positiveIncrease    23079    23597
## 100 2020-06-12         deathIncrease      766      775
## 101 2020-06-15      positiveIncrease    18330    18649
## 102 2020-06-15         deathIncrease      387      381
## 103 2020-06-16      positiveIncrease    22910    23478
## 104 2020-06-16         deathIncrease      718      730
## 105 2020-06-17         deathIncrease      779      767
## 106 2020-06-18      positiveIncrease    27046    27746
## 107 2020-06-18         deathIncrease      685      705
## 108 2020-06-19      positiveIncrease    30902    31471
## 109 2020-06-20         deathIncrease      615      629
## 110 2020-06-21      positiveIncrease    29180    27928
## 111 2020-06-22      positiveIncrease    26775    27281
## 112 2020-06-23         deathIncrease      722      710
## 113 2020-06-24         deathIncrease      707      724
## 114 2020-06-26         deathIncrease      621      637
## 115 2020-06-27         deathIncrease      503      511
## 116 2020-06-29      positiveIncrease    39279    39813
## 117 2020-06-29         deathIncrease      338      332
## 118 2020-06-30      positiveIncrease    46994    47864
## 119 2020-06-30         deathIncrease      580      596
## 120 2020-07-04         deathIncrease      300      306
## 121 2020-07-06      positiveIncrease    40766    41959
## 122 2020-07-06         deathIncrease      235      243
## 123 2020-07-07      positiveIncrease    50996    51687
## 124 2020-07-07         deathIncrease      910      923
## 125 2020-07-08         deathIncrease      818      807
## 126 2020-07-10         deathIncrease      835      854
## 127 2020-07-13      positiveIncrease    57075    58133
## 128 2020-07-14      positiveIncrease    58646    62687
## 129 2020-07-14         deathIncrease      745      736
## 130 2020-07-15      positiveIncrease    69232    65797
## 131 2020-07-20         deathIncrease      375      363
## 132 2020-07-21      positiveIncrease    62884    63930
## 133 2020-07-22         deathIncrease     1142     1171
## 134 2020-07-23         deathIncrease     1074     1056
## 135 2020-07-25         deathIncrease     1009     1023
## 136 2020-07-27      positiveIncrease    54515    55332
## 137 2020-07-28      positiveIncrease    58412    59003
## 138 2020-07-30         deathIncrease     1245     1259
## 139 2020-07-31         deathIncrease     1329     1312
## 140 2020-08-02      positiveIncrease    53220    46812
## 141 2020-08-03      positiveIncrease    42698    49713
## 142 2020-08-04      positiveIncrease    51214    51866
## 143 2020-08-06         deathIncrease     1237     1251
## 144 2020-08-10      positiveIncrease    41304    42089
## 145 2020-08-11      positiveIncrease    54958    55701
## 146 2020-08-14      positiveIncrease    57102    55636
## 147 2020-08-18      positiveIncrease    40057    40795
## 148 2020-08-20         deathIncrease     1122     1134
## 149 2020-08-25      positiveIncrease    36804    36379
## 150 2020-08-29      positiveIncrease    43897    44501
## 151 2020-08-30      positiveIncrease    38765    39501
## 152 2020-08-31         deathIncrease      377      366
## 153 2020-09-07      positiveIncrease    28047    28682
## 154 2020-09-08         deathIncrease      350      358
## 155 2020-09-09      positiveIncrease    30616    31114
## 156 2020-09-10         deathIncrease     1156     1170
## 157 2020-09-12         deathIncrease      821      810
## 158 2020-09-14      positiveIncrease    33504    33864
## 159 2020-09-15      positiveIncrease    34756    35445
## 160 2020-09-16         deathIncrease     1184     1200
## 161 2020-09-17         deathIncrease      878      863
## 162 2020-09-19         deathIncrease      751      740
## 163 2020-09-20      positiveIncrease    35442    36295
## 164 2020-09-23      positiveIncrease    39288    38567
## 165 2020-09-24         deathIncrease      933      921
## 166 2020-09-26      positiveIncrease    47007    47856
## 167 2020-09-27      positiveIncrease    34900    35454
## 168 2020-09-28      positiveIncrease    35251    36524
## 169 2020-09-28         deathIncrease      245      257
## 170 2020-10-02         deathIncrease      844      835
## 171 2020-10-03      positiveIncrease    50823    51372
## 172 2020-10-04      positiveIncrease    38001    38439
## 173 2020-10-04         deathIncrease      373      363
## 174 2020-10-05      positiveIncrease    37704    38133
## 175 2020-10-06         deathIncrease      622      634
## 176 2020-10-09         deathIncrease      905      893
## 177 2020-10-10         deathIncrease      690      665
## 178 2020-10-13      positiveIncrease    46930    48387
## 179 2020-10-13         deathIncrease      724      690
## 180 2020-10-14         deathIncrease      798      811
## 181 2020-10-15         deathIncrease      928      951
## 182 2020-10-16         deathIncrease      894      877
## 183 2020-10-18      positiveIncrease    47637    48922
## 184 2020-10-18         deathIncrease      402      393
## 185 2020-10-19         deathIncrease      451      456
## 186 2020-10-21      positiveIncrease    61404    58606
## 187 2020-10-22      positiveIncrease    73197    75248
## 188 2020-10-22         deathIncrease     1126     1143
## 189 2020-10-23         deathIncrease      941      916
## 190 2020-10-24         deathIncrease      897      885
## Joining, by = c("date", "name")
## Warning: Removed 69 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##    state                  name newValue oldValue
## 1     AK      positiveIncrease    12523    13535
## 2     CO      positiveIncrease    93398    91570
## 3     CO         deathIncrease     2218     2076
## 4     FL      positiveIncrease   766305   776249
## 5     ND         deathIncrease      453      345
## 6     NM      positiveIncrease    41040    40168
## 7     NM hospitalizedCurrently    27399    27120
## 8     PR      positiveIncrease    31067    61275
## 9     RI      positiveIncrease    30581    30116
## 10    WA      positiveIncrease   105278   101345
## 11    WA hospitalizedCurrently    92643    69716
## Rows: 16,595
## Columns: 55
## $ date                        <date> 2020-12-23, 2020-12-23, 2020-12-23, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 43361, 334569, 207941, 0, 473273, 19640...
## $ probableCases               <dbl> NA, 63069, 33159, NA, 20593, NA, 13007,...
## $ negative                    <dbl> 1178666, 1541851, 1796073, 2140, 224533...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults            <dbl> 1222027, 1813351, 1970855, 2140, 473226...
## $ hospitalizedCurrently       <dbl> 111, 2535, 1110, NA, 4163, 19361, 1336,...
## $ hospitalizedCumulative      <dbl> 961, 31651, 10826, NA, 34112, NA, 17642...
## $ inIcuCurrently              <dbl> NA, NA, 340, NA, 972, 3955, NA, NA, 76,...
## $ inIcuCumulative             <dbl> NA, 2412, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 15, NA, 174, NA, 673, NA, NA, NA, 27, N...
## $ onVentilatorCumulative      <dbl> NA, 1385, 1163, NA, NA, NA, NA, NA, NA,...
## $ recovered                   <dbl> 7165, 193149, 182024, NA, 69298, NA, 16...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "B...
## $ lastUpdateEt                <chr> "12/23/2020 03:59", "12/23/2020 11:00",...
## $ dateModified                <dttm> 2020-12-23 03:59:00, 2020-12-23 11:00:...
## $ checkTimeEt                 <chr> "12/22 22:59", "12/23 06:00", "12/22 19...
## $ death                       <dbl> 197, 4587, 3376, 0, 8179, 23284, 4462, ...
## $ hospitalized                <dbl> 961, 31651, 10826, NA, 34112, NA, 17642...
## $ dateChecked                 <dttm> 2020-12-23 03:59:00, 2020-12-23 11:00:...
## $ totalTestsViral             <dbl> 1222027, NA, 1970855, 2140, 4732267, 30...
## $ positiveTestsViral          <dbl> 51776, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 1168903, NA, 1796073, NA, NA, NA, NA, N...
## $ positiveCasesViral          <dbl> NA, 271500, 174782, 0, 452680, 1964076,...
## $ deathConfirmed              <dbl> 197, 4023, 2939, NA, 7434, NA, 3831, 46...
## $ deathProbable               <dbl> NA, 564, 437, NA, 745, NA, 631, 1095, N...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 4153784, NA, 84...
## $ totalTestsPeopleViral       <dbl> NA, 1813351, NA, NA, 2698015, NA, 20701...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 385614, NA, 265742, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 27602, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 235832, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 83461, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 216498, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 39849, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 360, 4758, 2893, 0, 6058, 39069, 2948, ...
## $ negativeIncrease            <dbl> 6385, 9211, 10877, 0, 11584, 239309, 10...
## $ total                       <dbl> 1222027, 1876420, 2004014, 2140, 271860...
## $ totalTestResultsIncrease    <dbl> 6745, 12372, 12877, 0, 42251, 278378, 4...
## $ posNeg                      <dbl> 1222027, 1876420, 2004014, 2140, 271860...
## $ deathIncrease               <dbl> 3, 135, 38, 0, 54, 361, 93, 33, 7, 0, 1...
## $ hospitalizedIncrease        <dbl> 13, 346, 175, 0, 493, 0, 161, 0, 0, 0, ...
## $ hash                        <chr> "559dd0f20fea1e35ee86562d47dc2b1046727e...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState    cases deaths  hosp     tests     n
##   <lgl>         <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE         79802   1554    NA    525297  1415
## 2 TRUE       18158947 315959    NA 237123094 15180
## Rows: 15,180
## Columns: 6
## $ date   <date> 2020-12-23, 2020-12-23, 2020-12-23, 2020-12-23, 2020-12-23,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 360, 4758, 2893, 6058, 39069, 2948, 1745, 326, 612, 11100, 5...
## $ deaths <dbl> 3, 135, 38, 54, 361, 93, 33, 7, 0, 121, 56, 3, 15, 12, 171, ...
## $ hosp   <dbl> 111, 2535, 1110, 4163, 19361, 1336, 1155, 253, 454, 5590, 41...
## $ tests  <dbl> 6745, 12372, 12877, 42251, 278378, 43042, 28877, 9790, 3382,...
## Rows: 15,180
## Columns: 14
## $ date   <date> 2020-01-13, 2020-01-14, 2020-01-15, 2020-01-16, 2020-01-17,...
## $ state  <chr> "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "MA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ cpm    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, 0.01992331, 0.01992331, 0.03984662, 0.03984662, ...
## $ dpm7   <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, NA, 0, NA, 0, 0, 0, 0, ...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, 0.00000000, 0.00000000, 0.00000000, 0.00000000, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-24 through current
## 
## Recency is defined as 2020-11-24 through current

## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

saveToRDS(old_hier5_201224, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/old_hier5_201224.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

New data are downloaded from USA Facts, with existing segments used:

# Locations for the population, case, and death file
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201225.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201225.csv"

# Run old segments against new data
cty_old_20201225 <- readRunUSAFacts(maxDate="2020-12-23", 
                                    popLoc=popLoc, 
                                    caseLoc=caseLoc, 
                                    deathLoc=deathLoc, 
                                    # Download only if either the case or death file is missing locally
                                    dlCaseDeath=!(file.exists(caseLoc) & file.exists(deathLoc)),
                                    # Burden data from the Oct 26 run serves as the comparison baseline
                                    oldFile=readFromRDS("cty_20201026")$dfBurden, 
                                    # Reuse the county cluster assignments from the Dec 3 run
                                    existingCountyClusters=readFromRDS("cty_new_20201203")$clustVec
                                    )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,076,715
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,076,715
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state      <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date       <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## 
## County name mismatches (case name will be used)
## # A tibble: 15 x 5
##    countyFIPS stateFIPS nameCase              nameDeath            pop_mean_case
##         <dbl>     <dbl> <chr>                 <chr>                        <dbl>
##  1       8014         8 Broomfield and City ~ Broomfield (CO)              70465
##  2      27073        27 Lac Qui Parle (MN)    Lac qui Parle (MN)            6623
##  3      51115        51 Matthews (VA)         Mathews (VA)                  8834
##  4      51510        51 Alexandria City (VA)  Alexandria city (VA)        159428
##  5      51540        51 Charlottesville City~ Charlottesville cit~         47266
##  6      51550        51 Chesapeake City (VA)  Chesapeake city (VA)        244835
##  7      51590        51 Danville City (VA)    Danville city (VA)           40044
##  8      51630        51 Fredericksburg City ~ Fredericksburg city~         29036
##  9      51660        51 Harrisonburg City (V~ Harrisonburg city (~         53016
## 10      51683        51 Manassas City (VA)    Manassas city (VA)           41085
## 11      51710        51 Norfolk City (VA)     Norfolk city (VA)           242742
## 12      51740        51 Portsmouth City (VA)  Portsmouth city (VA)         94398
## 13      51760        51 Richmond City (VA)    Richmond city (VA)          230436
## 14      51800        51 Suffolk City (VA)     Suffolk city (VA)            92108
## 15      51810        51 Virginia Beach City ~ Virginia Beach city~        449974
## 
## In cases but not in deaths
## # A tibble: 0 x 11
## # ... with 11 variables: countyFIPS <dbl>, countyName.x <chr>, state <chr>,
## #   stateFIPS <dbl>, date <date>, cumCases <dbl>, countyName.y <chr>,
## #   population <dbl>, burden <dbl>, county <chr>, bold <dbl>
## 
## In deaths but not in cases
## # A tibble: 0 x 11
## # ... with 11 variables: countyFIPS <dbl>, countyName.x <chr>, state <chr>,
## #   stateFIPS <dbl>, date <date>, cumDeaths <dbl>, countyName.y <chr>,
## #   population <dbl>, burden <dbl>, county <chr>, bold <dbl>
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## 
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
##   cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
##          <dbl>        <dbl>            <dbl>           <dbl>
## 1            0      0.00125                0               0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-24 through current
## 
## Recency is defined as 2020-11-24 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

saveToRDS(cty_old_20201225, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201225.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

New data are downloaded from CDC:

# Download new data
# CSV of CDC weekly all-cause deaths by jurisdiction and age group
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201226.csv"
# readRunCDCAllCause (project function) processes the CDC all-cause mortality
# file and compares 2020 against the 2015-2019 baseline.
cdcList_20201226 <- readRunCDCAllCause(loc=cdcLoc, 
                                       # Baseline period start and current year
                                       startYear=2015, 
                                       curYear=2020,
                                       # Include data through week 42 of the current year,
                                       # starting comparisons at week 9 — TODO confirm semantics
                                       weekThru=42, 
                                       startWeek=9, 
                                       # Existing CTP segment list to measure against
                                       lst=old_hier5_201224, 
                                       epiMap=readFromRDS("epiMonth"), 
                                       agePopData=readFromRDS("usPopBucket2020"), 
                                       # COVID death data considered complete through this date
                                       cvDeathThru="2020-10-17", 
                                       cdcPlotStartWeek=10, 
                                       # Only download if the local copy is absent
                                       dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)), 
                                       # NC has known suppression issues (see checks below);
                                       # exclude it from the data-suppression validation
                                       stateNoCheck=c("NC")
                                       )
## Rows: 182,606
## Columns: 11
## $ Jurisdiction         <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date`   <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year                 <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group`          <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths`   <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period`        <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type                 <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 182,606
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   age                    n n_deaths_na   deaths
##   <chr>              <int>       <int>    <dbl>
## 1 25-44 years        27375           7  3360351
## 2 45-64 years        33398          14 13127173
## 3 65-74 years        33382          13 13067870
## 4 75-84 years        33407          16 16243292
## 5 85 years and older 33395          17 21145778
## 6 Under 25 years     21649           2  1438361
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups:   period, year [6]
##    period     year type                     n n_deaths_na  deaths
##    <chr>     <int> <chr>                <int>       <int>   <dbl>
##  1 2015-2019  2015 Predicted (weighted) 15286           0 5416402
##  2 2015-2019  2015 Unweighted           15286           0 5416402
##  3 2015-2019  2016 Predicted (weighted) 15366           0 5483777
##  4 2015-2019  2016 Unweighted           15366           0 5483777
##  5 2015-2019  2017 Predicted (weighted) 15317           0 5643340
##  6 2015-2019  2017 Unweighted           15317           0 5643340
##  7 2015-2019  2018 Predicted (weighted) 15307           0 5698025
##  8 2015-2019  2018 Unweighted           15307           0 5698025
##  9 2015-2019  2019 Predicted (weighted) 15317           0 5725493
## 10 2015-2019  2019 Unweighted           15317           0 5725493
## 11 2020       2020 Predicted (weighted) 14735          40 6286267
## 12 2020       2020 Unweighted           14685          29 6162484
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups:   period [2]
##   period    Suppress                                       n n_deaths_na  deaths
##   <chr>     <chr>                                      <int>       <int>   <dbl>
## 1 2015-2019 <NA>                                      153186           0  5.59e7
## 2 2020      Suppressed (counts highly incomplete, <5~     69          69  0.    
## 3 2020      <NA>                                       29351           0  1.24e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups:   period [2]
##   period   Note                                            n n_deaths_na  deaths
##   <chr>    <chr>                                       <int>       <int>   <dbl>
## 1 2015-20~ <NA>                                       153186           0  5.59e7
## 2 2020     Data in recent weeks are incomplete. Only~  23560          16  1.06e7
## 3 2020     Data in recent weeks are incomplete. Only~    492           0  2.33e5
## 4 2020     Data in recent weeks are incomplete. Only~    157           5  1.07e4
## 5 2020     Data in recent weeks are incomplete. Only~   2789          48  6.52e5
## 6 2020     Data in recent weeks are incomplete. Only~     60           0  3.19e4
## 7 2020     Estimates for Pennsylvania are too low fo~     48           0  2.26e4
## 8 2020     Weights may be too low to account for und~    200           0  6.85e4
## 9 2020     <NA>                                         2114           0  8.70e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
##    state         Jurisdiction    n n_deaths_na   deaths
## 1     US        United States 3720           0 34069833
## 2     CA           California 3720           0  3237436
## 3     FL              Florida 3720           0  2485669
## 4     TX                Texas 3720           0  2457802
## 5     PA         Pennsylvania 3720           0  1635094
## 6     OH                 Ohio 3720           0  1477003
## 7     IL             Illinois 3720           0  1291497
## 8     NY             New York 3720           0  1214185
## 9     MI             Michigan 3720           0  1175121
## 10    NC       North Carolina 3616          40  1077728
## 11    GA              Georgia 3719           0  1021638
## 12    NJ           New Jersey 3713           0   909904
## 13    TN            Tennessee 3720           0   892157
## 14    VA             Virginia 3720           0   817580
## 15    IN              Indiana 3713           0   793650
## 16    MO             Missouri 3718           0   773467
## 17    AZ              Arizona 3720           0   721877
## 18    MA        Massachusetts 3682           0   718016
## 19    YC        New York City 3716           0   700147
## 20    WA           Washington 3720           0   679310
## 21    AL              Alabama 3717           0   632762
## 22    WI            Wisconsin 3701           0   630622
## 23    MD             Maryland 3714           0   601409
## 24    SC       South Carolina 3717           0   592616
## 25    KY             Kentucky 3682           0   576726
## 26    LA            Louisiana 3715           0   555718
## 27    MN            Minnesota 3674           0   534603
## 28    CO             Colorado 3718           0   473173
## 29    OK             Oklahoma 3702           0   470966
## 30    OR               Oregon 3545           0   435678
## 31    MS          Mississippi 3656           0   385285
## 32    AR             Arkansas 3610           0   384066
## 33    CT          Connecticut 3265          16   373500
## 34    IA                 Iowa 3348           0   361786
## 35    PR          Puerto Rico 3415           0   348271
## 36    KS               Kansas 3404           0   315132
## 37    NV               Nevada 3454           0   306317
## 38    WV        West Virginia 3148          13   262476
## 39    UT                 Utah 3604           0   226553
## 40    NM           New Mexico 3280           0   217208
## 41    NE             Nebraska 2989           0   200493
## 42    ME                Maine 2778           0   168870
## 43    ID                Idaho 2906           0   162354
## 44    NH        New Hampshire 2800           0   142220
## 45    HI               Hawaii 2684           0   130801
## 46    RI         Rhode Island 2594           0   120084
## 47    MT              Montana 2687           0   117305
## 48    DE             Delaware 2688           0   104851
## 49    SD         South Dakota 2572           0    93020
## 50    ND         North Dakota 2555           0    80901
## 51    DC District of Columbia 2672           0    67403
## 52    VT              Vermont 2458           0    65197
## 53    WY              Wyoming 2435           0    50392
## 54    AK               Alaska 2482           0    44953
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 182,606
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 89,054
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age          <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths       <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## 
##  *** Data suppression checks *** 
## # A tibble: 18 x 11
##    Jurisdiction weekEnding state year   week age   deaths period type  Suppress
##    <chr>        <date>     <chr> <fct> <int> <fct>  <dbl> <fct>  <chr> <chr>   
##  1 North Carol~ 2020-09-26 NC    2020     39 25-4~     NA 2020   Pred~ Suppres~
##  2 North Carol~ 2020-10-03 NC    2020     40 25-4~     NA 2020   Pred~ Suppres~
##  3 North Carol~ 2020-09-26 NC    2020     39 45-6~     NA 2020   Pred~ Suppres~
##  4 North Carol~ 2020-10-03 NC    2020     40 45-6~     NA 2020   Pred~ Suppres~
##  5 North Carol~ 2020-10-10 NC    2020     41 45-6~     NA 2020   Pred~ Suppres~
##  6 North Carol~ 2020-10-17 NC    2020     42 45-6~     NA 2020   Pred~ Suppres~
##  7 North Carol~ 2020-09-26 NC    2020     39 65-7~     NA 2020   Pred~ Suppres~
##  8 North Carol~ 2020-10-03 NC    2020     40 65-7~     NA 2020   Pred~ Suppres~
##  9 North Carol~ 2020-10-10 NC    2020     41 65-7~     NA 2020   Pred~ Suppres~
## 10 North Carol~ 2020-09-26 NC    2020     39 75-8~     NA 2020   Pred~ Suppres~
## 11 North Carol~ 2020-10-03 NC    2020     40 75-8~     NA 2020   Pred~ Suppres~
## 12 North Carol~ 2020-10-10 NC    2020     41 75-8~     NA 2020   Pred~ Suppres~
## 13 North Carol~ 2020-10-17 NC    2020     42 75-8~     NA 2020   Pred~ Suppres~
## 14 North Carol~ 2020-09-26 NC    2020     39 85 y~     NA 2020   Pred~ Suppres~
## 15 North Carol~ 2020-10-03 NC    2020     40 85 y~     NA 2020   Pred~ Suppres~
## 16 North Carol~ 2020-10-10 NC    2020     41 85 y~     NA 2020   Pred~ Suppres~
## 17 North Carol~ 2020-10-17 NC    2020     42 85 y~     NA 2020   Pred~ Suppres~
## 18 North Carol~ 2020-09-26 NC    2020     39 Unde~     NA 2020   Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
## 
##  *** Data suppression checks failed - total of 18 suppressions
##  *** Of these suppressions, 15 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 83,760
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding   <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state        <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week         <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age          <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period       <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type         <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths       <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## 
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)

## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)

## 
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` regrouping output by 'state' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'year' (override with `.groups` argument)

## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

saveToRDS(cdcList_20201226, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201226.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

The updated comparison of deaths in the raw and processed USA Facts files is performed:

# Raw USA Facts county-level cumulative deaths file (downloaded 2020-12-25)
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201225.csv"
rawCountyDeath <- readr::read_csv(deathLoc) 
# Reshape from one-column-per-date to one row per county-date
deathLong <- rawCountyDeath %>%
    pivot_longer(-c(countyFIPS, `County Name`, State, stateFIPS), 
                 names_to="date", 
                 values_to="deaths"
                 )
# Zero-pad FIPS to 5 characters, parse the date, and keep only the latest date
allCountyDeath <- deathLong %>%
    mutate(countyFIPS=stringr::str_pad(countyFIPS, side="left", width=5, pad="0"), 
           date=lubridate::mdy(date)
           ) %>%
    filter(date==max(date))
allCountyDeath
## # A tibble: 3,195 x 6
##    countyFIPS `County Name`         State stateFIPS date       deaths
##    <chr>      <chr>                 <chr>     <dbl> <date>      <dbl>
##  1 00000      Statewide Unallocated AL            1 2020-12-23      0
##  2 01001      Autauga County        AL            1 2020-12-23     46
##  3 01003      Baldwin County        AL            1 2020-12-23    151
##  4 01005      Barbour County        AL            1 2020-12-23     32
##  5 01007      Bibb County           AL            1 2020-12-23     42
##  6 01009      Blount County         AL            1 2020-12-23     57
##  7 01011      Bullock County        AL            1 2020-12-23     20
##  8 01013      Butler County         AL            1 2020-12-23     45
##  9 01015      Calhoun County        AL            1 2020-12-23    145
## 10 01017      Chambers County       AL            1 2020-12-23     61
## # ... with 3,185 more rows
# Collapse the processed burden data to one row per county, taking the maximum
# (i.e. latest cumulative) population, cases, and deaths; FIPS is zero-padded
# to 5 characters for joining against the raw file
countyAll <- cty_old_20201225$burdenData %>% 
    mutate(countyFIPS=stringr::str_pad(countyFIPS, width=5, side="left", pad="0")) %>%
    group_by(county, countyFIPS) %>% 
    summarize(pop=max(population, na.rm=TRUE), 
              cases=max(cumCases, na.rm=TRUE), 
              deaths=max(cumDeaths, na.rm=TRUE), 
              .groups="drop"
              )
countyAll
## # A tibble: 3,142 x 5
##    county             countyFIPS    pop cases deaths
##    <chr>              <chr>       <dbl> <dbl>  <dbl>
##  1 Abbeville (SC)     45001       24527  1204     23
##  2 Acadia Parish (LA) 22001       62045  4816    136
##  3 Accomack (VA)      51001       32316  1604     25
##  4 Ada (ID)           16001      481587 35094    320
##  5 Adair (IA)         19001        7152   555     17
##  6 Adair (KY)         21001       19202  1173     39
##  7 Adair (MO)         29001       25343  1512      4
##  8 Adair (OK)         40001       22194  1729     13
##  9 Adams (CO)         08001      517421 37934    480
## 10 Adams (IA)         19003        3602   241      2
## # ... with 3,132 more rows
# Records in the processed deaths file but not in the raw file (should be empty)
anti_join(countyAll, allCountyDeath, by="countyFIPS")
## # A tibble: 0 x 5
## # ... with 5 variables: county <chr>, countyFIPS <chr>, pop <dbl>, cases <dbl>,
## #   deaths <dbl>
# Records in the raw deaths file but not in the processed deaths file
mismatch_1 <- anti_join(allCountyDeath, countyAll, by="countyFIPS")
# Show only rows with nonzero deaths, largest first
mismatch_1 %>%
    filter(deaths > 0) %>%
    arrange(desc(deaths)) %>%
    as.data.frame()
##    countyFIPS                        County Name State stateFIPS       date
## 1       00000              Statewide Unallocated    KS        20 2020-12-23
## 2       00000              Statewide Unallocated    GA        13 2020-12-23
## 3       00000              Statewide Unallocated    NY        36 2020-12-23
## 4       00000              Statewide Unallocated    TN        47 2020-12-23
## 5       00000              Statewide Unallocated    MI        26 2020-12-23
## 6       00000              Statewide Unallocated    MD        24 2020-12-23
## 7       00001 New York City Unallocated/Probable    NY        36 2020-12-23
## 8       00000              Statewide Unallocated    TX        48 2020-12-23
## 9       00000              Statewide Unallocated    RI        44 2020-12-23
## 10      00000              Statewide Unallocated    MA        25 2020-12-23
## 11      00000              Statewide Unallocated    UT        49 2020-12-23
## 12      00000              Statewide Unallocated    WA        53 2020-12-23
## 13      00000              Statewide Unallocated    HI        15 2020-12-23
## 14      00000              Statewide Unallocated    MT        30 2020-12-23
## 15      00000              Statewide Unallocated    MN        27 2020-12-23
## 16      00000              Statewide Unallocated    NE        31 2020-12-23
##    deaths
## 1     302
## 2     247
## 3     147
## 4     131
## 5     129
## 6      67
## 7      60
## 8      14
## 9       7
## 10      6
## 11      6
## 12      3
## 13      2
## 14      2
## 15      1
## 16      1
# Classify each mismatched record as "unallocated" vs county-"specific", then
# sum deaths by state and category to confirm all differences are unallocated
mismatch_1 %>%
    filter(deaths > 0) %>%
    mutate(unalloc=if_else(stringr::str_detect(string=`County Name`, pattern="nallocated"), 
                           "unallocated", 
                           "specific"
                           )
           ) %>%
    pivot_wider(id_cols=c(State), 
                names_from=unalloc, 
                values_from=deaths, 
                values_fn=sum, 
                values_fill=0
                ) %>%
    arrange(desc(unallocated))
## # A tibble: 15 x 2
##    State unallocated
##    <chr>       <dbl>
##  1 KS            302
##  2 GA            247
##  3 NY            207
##  4 TN            131
##  5 MI            129
##  6 MD             67
##  7 TX             14
##  8 RI              7
##  9 MA              6
## 10 UT              6
## 11 WA              3
## 12 HI              2
## 13 MT              2
## 14 MN              1
## 15 NE              1

As expected, the only differences are in the unallocated deaths by state, with Kansas, Georgia, and New York being the main contributors to this category.

State-level totals are compared between USA Facts and COVID Tracking project using the latest data:

usaFacts <- cty_old_20201225$clusterStateData %>% 
    filter(date <= as.Date("2020-12-23")) %>% 
    group_by(state) %>%
    summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths), .groups="drop")

ctp <- old_hier5_201224$plotData %>% 
    filter(date <= as.Date("2020-12-23")) %>% 
    group_by(state) %>% 
    summarize(cases=sum(cases), deaths=sum(deaths), .groups="drop")

# Stack the two sources into one long table; naming the inputs to
# bind_rows() makes .id carry the source label directly, replacing the
# post-hoc '1'/'2' lookup recode
usaCTP <- bind_rows(
    "USA Facts" = usaFacts,
    "COVID Tracking Project" = ctp,
    .id = "source"
) %>%
    pivot_longer(cols = -c(source, state), names_to = "metric", values_to = "value")

# Plot CTP totals as a percentage of USA Facts totals, faceted by metric,
# with states ordered by their largest deviation from parity
usaCTP %>%
    pivot_wider(id_cols = c(state, metric), names_from = "source", values_from = "value") %>%
    mutate(pct = `COVID Tracking Project` / `USA Facts`) %>%
    ggplot(aes(x = fct_reorder(state, abs(pct - 1), .fun = max), y = pct)) +
    geom_point() +
    coord_flip() +
    labs(
        x = "",
        y = "COVID Tracking Project as % of USA Facts",
        title = "Comparison of COVID Tracking Project and USA Facts",
        subtitle = "Data as of 23-DEC-2020"
    ) +
    geom_hline(yintercept = 1, lty = 2) +
    facet_wrap(~metric)

# Same ratio plot, restricted to states off by at least 2.5% on some
# metric, with the ratio printed as a text label instead of a point
usaCTP %>%
    pivot_wider(id_cols = c(state, metric), names_from = "source", values_from = "value") %>%
    mutate(pct = `COVID Tracking Project` / `USA Facts`) %>%
    group_by(state) %>%
    filter(max(abs(pct - 1)) >= 0.025) %>%
    ggplot(aes(x = fct_reorder(state, abs(pct - 1), .fun = max), y = pct)) +
    geom_text(aes(label = paste0(round(100 * pct, 1), "%")), size = 3) +
    coord_flip() +
    labs(
        x = "",
        y = "COVID Tracking Project as % of USA Facts",
        title = "Comparison of COVID Tracking Project and USA Facts",
        subtitle = "Data as of 23-DEC-2020 (filtered to only states at least 2.5% different on one metric)"
    ) +
    geom_hline(yintercept = 1, lty = 2) +
    facet_wrap(~metric)

The states with mismatches appear similar to previous, though the previous issue with Virginia has been resolved and some metrics such as Iowa deaths appear to now be well aligned.

The comparison is extended:

# Absolute difference in reported deaths (CTP minus USA Facts) by state;
# bar fill records which source reported more
usaCTP %>%
    filter(metric == "deaths") %>%
    pivot_wider(id_cols = state, names_from = "source", values_from = "value") %>%
    mutate(
        ctpMinusUSA = `COVID Tracking Project` - `USA Facts`,
        absDiff = abs(ctpMinusUSA),
        signDiff = ctpMinusUSA >= 0
    ) %>%
    ggplot(aes(x = fct_reorder(state, absDiff), y = absDiff)) +
    geom_text(aes(y = absDiff + 100, label = absDiff), hjust = 0, size = 3) +
    geom_col(aes(fill = signDiff)) +
    scale_fill_discrete("CTP >= USAF") +
    coord_flip() +
    labs(
        x = "",
        y = "Absolute Value of Difference in Reported Deaths",
        title = "Difference in Reported Deaths (COVID Tracking Project vs USA Facts)"
    )

# Same comparison for reported cases, with labels rounded to thousands
usaCTP %>%
    filter(metric == "cases") %>%
    pivot_wider(id_cols = state, names_from = "source", values_from = "value") %>%
    mutate(
        ctpMinusUSA = `COVID Tracking Project` - `USA Facts`,
        absDiff = abs(ctpMinusUSA),
        signDiff = ctpMinusUSA >= 0
    ) %>%
    ggplot(aes(x = fct_reorder(state, absDiff), y = absDiff)) +
    geom_text(aes(y = absDiff + 100, label = paste0(round(absDiff / 1000, 1), "k")), hjust = 0, size = 3) +
    geom_col(aes(fill = signDiff)) +
    scale_fill_discrete("CTP >= USAF") +
    coord_flip() +
    labs(
        x = "",
        y = "Absolute Value of Difference in Reported Cases",
        title = "Difference in Reported Cases (COVID Tracking Project vs USA Facts)"
    )

There are still some differences between the data sources that may be interesting to explore further.

Several of the states with larger differences are explored:

# Compare KS daily/cumulative cases and deaths between CTP and USA Facts
exploreState("KS", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,260 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-06 COVID Tracking Project cases      0        0
##  2 2020-03-07 COVID Tracking Project cases      0        0
##  3 2020-03-08 COVID Tracking Project cases      1        1
##  4 2020-03-09 COVID Tracking Project cases      0        1
##  5 2020-03-10 COVID Tracking Project cases      0        1
##  6 2020-03-11 COVID Tracking Project cases      0        1
##  7 2020-03-12 COVID Tracking Project cases      3        4
##  8 2020-03-13 COVID Tracking Project cases      2        6
##  9 2020-03-14 COVID Tracking Project cases      0        6
## 10 2020-03-15 COVID Tracking Project cases      2        8
## # ... with 1,250 more rows
# Compare NY between the sources (NYC death-tracking differences expected)
exploreState("NY", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,268 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-02 COVID Tracking Project cases      0        0
##  2 2020-03-03 COVID Tracking Project cases      1        1
##  3 2020-03-04 COVID Tracking Project cases      0        1
##  4 2020-03-05 COVID Tracking Project cases      2        3
##  5 2020-03-06 COVID Tracking Project cases     22       25
##  6 2020-03-07 COVID Tracking Project cases     11       36
##  7 2020-03-08 COVID Tracking Project cases     24       60
##  8 2020-03-09 COVID Tracking Project cases     28       88
##  9 2020-03-10 COVID Tracking Project cases     63      151
## 10 2020-03-11 COVID Tracking Project cases     44      195
## # ... with 1,258 more rows
# Compare IA between the sources
exploreState("IA", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,260 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-06 COVID Tracking Project cases      0        0
##  2 2020-03-07 COVID Tracking Project cases      0        0
##  3 2020-03-08 COVID Tracking Project cases      0        0
##  4 2020-03-09 COVID Tracking Project cases      3        3
##  5 2020-03-10 COVID Tracking Project cases      5        8
##  6 2020-03-11 COVID Tracking Project cases      5       13
##  7 2020-03-12 COVID Tracking Project cases      1       14
##  8 2020-03-13 COVID Tracking Project cases      2       16
##  9 2020-03-14 COVID Tracking Project cases      1       17
## 10 2020-03-15 COVID Tracking Project cases      1       18
## # ... with 1,250 more rows
# Compare GA between the sources
exploreState("GA", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,264 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-04 COVID Tracking Project cases      0        0
##  2 2020-03-05 COVID Tracking Project cases      0        0
##  3 2020-03-06 COVID Tracking Project cases      0        0
##  4 2020-03-07 COVID Tracking Project cases      4        4
##  5 2020-03-08 COVID Tracking Project cases      1        5
##  6 2020-03-09 COVID Tracking Project cases      5       10
##  7 2020-03-10 COVID Tracking Project cases      5       15
##  8 2020-03-11 COVID Tracking Project cases      5       20
##  9 2020-03-12 COVID Tracking Project cases      9       29
## 10 2020-03-13 COVID Tracking Project cases     11       40
## # ... with 1,254 more rows
# Compare IL between the sources
exploreState("IL", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,264 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-04 COVID Tracking Project cases      0        0
##  2 2020-03-05 COVID Tracking Project cases      1        1
##  3 2020-03-06 COVID Tracking Project cases      0        1
##  4 2020-03-07 COVID Tracking Project cases      1        2
##  5 2020-03-08 COVID Tracking Project cases      0        2
##  6 2020-03-09 COVID Tracking Project cases      1        3
##  7 2020-03-10 COVID Tracking Project cases     12       15
##  8 2020-03-11 COVID Tracking Project cases      0       15
##  9 2020-03-12 COVID Tracking Project cases      6       21
## 10 2020-03-13 COVID Tracking Project cases      7       28
## # ... with 1,254 more rows
# Compare RI between the sources
exploreState("RI", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,270 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-01 COVID Tracking Project cases      0        0
##  2 2020-03-02 COVID Tracking Project cases      0        0
##  3 2020-03-03 COVID Tracking Project cases      0        0
##  4 2020-03-04 COVID Tracking Project cases      0        0
##  5 2020-03-05 COVID Tracking Project cases      1        1
##  6 2020-03-06 COVID Tracking Project cases      0        1
##  7 2020-03-07 COVID Tracking Project cases      0        1
##  8 2020-03-08 COVID Tracking Project cases      0        1
##  9 2020-03-09 COVID Tracking Project cases      1        2
## 10 2020-03-10 COVID Tracking Project cases      1        3
## # ... with 1,260 more rows
# Compare HI between the sources
exploreState("HI", ctp=old_hier5_201224, usa=cty_old_20201225)

## # A tibble: 1,264 x 5
## # Groups:   source, metric [4]
##    date       source                 metric value cumValue
##    <date>     <chr>                  <chr>  <dbl>    <dbl>
##  1 2020-03-04 COVID Tracking Project cases      0        0
##  2 2020-03-05 COVID Tracking Project cases      0        0
##  3 2020-03-06 COVID Tracking Project cases      0        0
##  4 2020-03-07 COVID Tracking Project cases      1        1
##  5 2020-03-08 COVID Tracking Project cases      0        1
##  6 2020-03-09 COVID Tracking Project cases      1        2
##  7 2020-03-10 COVID Tracking Project cases      0        2
##  8 2020-03-11 COVID Tracking Project cases      0        2
##  9 2020-03-12 COVID Tracking Project cases      0        2
## 10 2020-03-13 COVID Tracking Project cases      0        2
## # ... with 1,254 more rows
  • KS - cases are well aligned between sources; deaths were well aligned until recently when there appears to be a lag in USA Facts driven by deaths not yet allocated to counties
  • NY - as noted previously, LTC death tracking and probable vs. confirmed death tracking in NYC drive large differences in reported numbers of deaths
  • IA - cases appear to be tracked higher in USA Facts with the divergence starting around September 2020
  • GA - COVID Tracking Project shows a trend break for deaths around November 2020 that may be interesting to explore further
  • IL - the data are reasonably similar in trend and magnitude
  • RI - data are more chunky in USA Facts and appear to lag COVID Tracking Project until a chunk “caught up” to where CTP was a week or so previously
  • HI - data are very well aligned, with the exception of what appears to be an ~10 day lag where USA Facts have not yet captured the latest county-level data

The segmenting process is run using the US census regions, available from state.division (DC is assigned as ‘South Atlantic’ so that it maps with Maryland and Virginia):

# Create census division clusters from the base R state datasets
# (state.division / state.abb); DC is appended manually and assigned to
# "South Atlantic" so that it clusters with Maryland and Virginia
censusClusters <- setNames(
    c(as.character(state.division), "South Atlantic"),
    c(state.abb, "DC")
)
censusClusters
##                   AL                   AK                   AZ 
## "East South Central"            "Pacific"           "Mountain" 
##                   AR                   CA                   CO 
## "West South Central"            "Pacific"           "Mountain" 
##                   CT                   DE                   FL 
##        "New England"     "South Atlantic"     "South Atlantic" 
##                   GA                   HI                   ID 
##     "South Atlantic"            "Pacific"           "Mountain" 
##                   IL                   IN                   IA 
## "East North Central" "East North Central" "West North Central" 
##                   KS                   KY                   LA 
## "West North Central" "East South Central" "West South Central" 
##                   ME                   MD                   MA 
##        "New England"     "South Atlantic"        "New England" 
##                   MI                   MN                   MS 
## "East North Central" "West North Central" "East South Central" 
##                   MO                   MT                   NE 
## "West North Central"           "Mountain" "West North Central" 
##                   NV                   NH                   NJ 
##           "Mountain"        "New England"    "Middle Atlantic" 
##                   NM                   NY                   NC 
##           "Mountain"    "Middle Atlantic"     "South Atlantic" 
##                   ND                   OH                   OK 
## "West North Central" "East North Central" "West South Central" 
##                   OR                   PA                   RI 
##            "Pacific"    "Middle Atlantic"        "New England" 
##                   SC                   SD                   TN 
##     "South Atlantic" "West North Central" "East South Central" 
##                   TX                   UT                   VT 
## "West South Central"           "Mountain"        "New England" 
##                   VA                   WA                   WV 
##     "South Atlantic"            "Pacific"     "South Atlantic" 
##                   WI                   WY                   DC 
## "East North Central"           "Mountain"     "South Atlantic"
# Use existing segments with updated data
# Path to the cached CTP daily download for the 14-DEC-2020 run
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201214.csv"
# Re-run the COVID Tracking Project pipeline using the census-division
# clusters defined above. downloadTo is NULL when the cached CSV already
# exists, so the raw data is downloaded at most once; readFrom always
# points at the cached file. compareFile diffs this run against the
# saved 25-OCT-2020 reference run (see the comparison output below).
ctp_census_201214 <- readRunCOVIDTrackingProject(thruLabel="Dec 13, 2020", 
                                                 downloadTo=if(file.exists(locDownload)) NULL else locDownload,
                                                 readFrom=locDownload, 
                                                 compareFile=readFromRDS("test_hier5_201025")$dfRaw,
                                                 useClusters=censusClusters
                                                 )
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   state = col_character(),
##   totalTestResultsSource = col_character(),
##   dataQualityGrade = col_character(),
##   lastUpdateEt = col_character(),
##   dateModified = col_datetime(format = ""),
##   checkTimeEt = col_character(),
##   dateChecked = col_datetime(format = ""),
##   fips = col_character(),
##   hash = col_character(),
##   grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## 
## File is unique by state and date
## 
## 
## Overall control totals in file:
## # A tibble: 1 x 3
##   positiveIncrease deathIncrease hospitalizedCurrently
##              <dbl>         <dbl>                 <dbl>
## 1         16339303        292404              12655613
## 
## *** COMPARISONS TO REFERENCE FILE: compareFile
## 
## Checkin for similarity of: column names
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: states
## In reference but not in current: 
## In current but not in reference: 
## 
## Checkin for similarity of: dates
## In reference but not in current: 
## In current but not in reference: 2020-12-14 2020-12-13 2020-12-12 2020-12-11 2020-12-10 2020-12-09 2020-12-08 2020-12-07 2020-12-06 2020-12-05 2020-12-04 2020-12-03 2020-12-02 2020-12-01 2020-11-30 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
##           date                  name newValue oldValue
## 1   2020-03-05      positiveIncrease       86      103
## 2   2020-03-06      positiveIncrease      128      109
## 3   2020-03-07      positiveIncrease      129      176
## 4   2020-03-10      positiveIncrease      441      387
## 5   2020-03-11      positiveIncrease      497      509
## 6   2020-03-12      positiveIncrease      745      671
## 7   2020-03-13      positiveIncrease      933     1072
## 8   2020-03-14      positiveIncrease      970      924
## 9   2020-03-15      positiveIncrease     1217     1291
## 10  2020-03-16      positiveIncrease     1847     1739
## 11  2020-03-17      positiveIncrease     2249     2588
## 12  2020-03-18      positiveIncrease     3364     3089
## 13  2020-03-21 hospitalizedCurrently     1492     1436
## 14  2020-03-23 hospitalizedCurrently     2812     2770
## 15  2020-03-24      positiveIncrease    11116    10769
## 16  2020-03-25      positiveIncrease    12590    12908
## 17  2020-03-25 hospitalizedCurrently     5140     5062
## 18  2020-03-28      positiveIncrease    19602    19925
## 19  2020-03-28         deathIncrease      551      544
## 20  2020-03-29         deathIncrease      504      515
## 21  2020-03-30      positiveIncrease    21467    22042
## 22  2020-03-31      positiveIncrease    25187    24853
## 23  2020-03-31         deathIncrease      907      890
## 24  2020-04-01      positiveIncrease    26115    25791
## 25  2020-04-06      positiveIncrease    28425    29002
## 26  2020-04-09      positiveIncrease    35090    34503
## 27  2020-04-10      positiveIncrease    33489    34380
## 28  2020-04-10         deathIncrease     2072     2108
## 29  2020-04-11      positiveIncrease    31105    30501
## 30  2020-04-11         deathIncrease     2079     2054
## 31  2020-04-13      positiveIncrease    24398    25195
## 32  2020-04-14      positiveIncrease    26078    25719
## 33  2020-04-15      positiveIncrease    29859    30307
## 34  2020-04-16      positiveIncrease    31577    30978
## 35  2020-04-23         deathIncrease     1814     1791
## 36  2020-04-24         deathIncrease     1972     1895
## 37  2020-04-25         deathIncrease     1627     1748
## 38  2020-04-27         deathIncrease     1287     1270
## 39  2020-04-29         deathIncrease     2685     2713
## 40  2020-05-01         deathIncrease     1808     1779
## 41  2020-05-02         deathIncrease     1531     1562
## 42  2020-05-05         deathIncrease     2494     2452
## 43  2020-05-06         deathIncrease     1916     1948
## 44  2020-05-07      positiveIncrease    27227    27537
## 45  2020-05-12      positiveIncrease    22558    22890
## 46  2020-05-12         deathIncrease     1506     1486
## 47  2020-05-13      positiveIncrease    21628    21285
## 48  2020-05-13         deathIncrease     1734     1704
## 49  2020-05-14         deathIncrease     1852     1879
## 50  2020-05-15      positiveIncrease    25422    24685
## 51  2020-05-15         deathIncrease     1535     1507
## 52  2020-05-16      positiveIncrease    23586    24702
## 53  2020-05-16         deathIncrease     1237      987
## 54  2020-05-17         deathIncrease      873      849
## 55  2020-05-21         deathIncrease     1377     1394
## 56  2020-05-22      positiveIncrease    24173    24433
## 57  2020-05-22         deathIncrease     1291     1341
## 58  2020-05-23      positiveIncrease    22365    21531
## 59  2020-05-23         deathIncrease     1038     1063
## 60  2020-05-24      positiveIncrease    18860    20072
## 61  2020-05-24         deathIncrease      689      680
## 62  2020-05-26         deathIncrease      665      645
## 63  2020-05-27         deathIncrease     1335     1321
## 64  2020-05-29         deathIncrease     1171     1184
## 65  2020-05-30      positiveIncrease    23437    23682
## 66  2020-06-01         deathIncrease      680      668
## 67  2020-06-02         deathIncrease      973      962
## 68  2020-06-03      positiveIncrease    20155    20390
## 69  2020-06-03         deathIncrease      974      993
## 70  2020-06-04      positiveIncrease    20383    20886
## 71  2020-06-04         deathIncrease      881      893
## 72  2020-06-05      positiveIncrease    23066    23394
## 73  2020-06-05         deathIncrease      837      826
## 74  2020-06-06      positiveIncrease    22558    23064
## 75  2020-06-06         deathIncrease      714      728
## 76  2020-06-08         deathIncrease      674      661
## 77  2020-06-09         deathIncrease      891      902
## 78  2020-06-12      positiveIncrease    23096    23597
## 79  2020-06-12         deathIncrease      766      775
## 80  2020-06-15         deathIncrease      387      381
## 81  2020-06-16         deathIncrease      718      730
## 82  2020-06-17         deathIncrease      779      767
## 83  2020-06-18      positiveIncrease    27089    27746
## 84  2020-06-18         deathIncrease      685      705
## 85  2020-06-19      positiveIncrease    30959    31471
## 86  2020-06-20      positiveIncrease    31951    32294
## 87  2020-06-20         deathIncrease      615      629
## 88  2020-06-21      positiveIncrease    28848    27928
## 89  2020-06-23      positiveIncrease    33885    33447
## 90  2020-06-23         deathIncrease      722      710
## 91  2020-06-24         deathIncrease      707      724
## 92  2020-06-26         deathIncrease      621      637
## 93  2020-06-27         deathIncrease      503      511
## 94  2020-06-29         deathIncrease      338      332
## 95  2020-06-30         deathIncrease      580      596
## 96  2020-07-02      positiveIncrease    53508    54085
## 97  2020-07-04         deathIncrease      300      306
## 98  2020-07-06      positiveIncrease    41494    41959
## 99  2020-07-06         deathIncrease      235      243
## 100 2020-07-07         deathIncrease      910      923
## 101 2020-07-08         deathIncrease      818      807
## 102 2020-07-10         deathIncrease      835      854
## 103 2020-07-14      positiveIncrease    59250    62687
## 104 2020-07-14         deathIncrease      745      736
## 105 2020-07-15      positiveIncrease    69101    65797
## 106 2020-07-20         deathIncrease      375      363
## 107 2020-07-22         deathIncrease     1142     1171
## 108 2020-07-23         deathIncrease     1074     1056
## 109 2020-07-25         deathIncrease     1009     1023
## 110 2020-07-26      positiveIncrease    60123    61000
## 111 2020-07-30         deathIncrease     1245     1259
## 112 2020-07-31         deathIncrease     1329     1312
## 113 2020-08-01      positiveIncrease    60245    61101
## 114 2020-08-02      positiveIncrease    52737    46812
## 115 2020-08-03      positiveIncrease    43122    49713
## 116 2020-08-06         deathIncrease     1237     1251
## 117 2020-08-08      positiveIncrease    53083    53712
## 118 2020-08-14      positiveIncrease    57093    55636
## 119 2020-08-16      positiveIncrease    41782    42487
## 120 2020-08-20         deathIncrease     1122     1134
## 121 2020-08-22      positiveIncrease    45722    46236
## 122 2020-08-24      positiveIncrease    34249    34643
## 123 2020-08-29      positiveIncrease    43962    44501
## 124 2020-08-31         deathIncrease      377      366
## 125 2020-09-02      positiveIncrease    30216    30603
## 126 2020-09-07      positiveIncrease    28142    28682
## 127 2020-09-08         deathIncrease      350      358
## 128 2020-09-10         deathIncrease     1156     1170
## 129 2020-09-12         deathIncrease      821      810
## 130 2020-09-15      positiveIncrease    34945    35445
## 131 2020-09-16         deathIncrease     1184     1200
## 132 2020-09-17         deathIncrease      878      863
## 133 2020-09-19      positiveIncrease    44906    45564
## 134 2020-09-19         deathIncrease      751      740
## 135 2020-09-20      positiveIncrease    35504    36295
## 136 2020-09-24         deathIncrease      933      921
## 137 2020-09-27      positiveIncrease    34983    35454
## 138 2020-09-28      positiveIncrease    35362    36524
## 139 2020-09-28         deathIncrease      245      257
## 140 2020-10-02         deathIncrease      844      835
## 141 2020-10-04      positiveIncrease    37982    38439
## 142 2020-10-04         deathIncrease      373      363
## 143 2020-10-06         deathIncrease      622      634
## 144 2020-10-09         deathIncrease      905      893
## 145 2020-10-10         deathIncrease      690      665
## 146 2020-10-11      positiveIncrease    46268    46946
## 147 2020-10-12      positiveIncrease    42645    43124
## 148 2020-10-13         deathIncrease      724      690
## 149 2020-10-14         deathIncrease      798      811
## 150 2020-10-15         deathIncrease      928      951
## 151 2020-10-16         deathIncrease      894      877
## 152 2020-10-17      positiveIncrease    57355    57943
## 153 2020-10-18      positiveIncrease    48280    48922
## 154 2020-10-18         deathIncrease      402      393
## 155 2020-10-19         deathIncrease      451      456
## 156 2020-10-21      positiveIncrease    60980    58606
## 157 2020-10-22      positiveIncrease    73003    75248
## 158 2020-10-22         deathIncrease     1126     1143
## 159 2020-10-23         deathIncrease      941      916
## 160 2020-10-24         deathIncrease      897      885
## Joining, by = c("date", "name")
## Warning: Removed 51 row(s) containing missing values (geom_path).
## 
## 
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
##    state                  name newValue oldValue
## 1     AK      positiveIncrease    12523    13535
## 2     CO      positiveIncrease    93398    91570
## 3     CO         deathIncrease     2218     2076
## 4     FL      positiveIncrease   766305   776249
## 5     ND         deathIncrease      453      345
## 6     NM      positiveIncrease    41040    40168
## 7     NM hospitalizedCurrently    27399    27120
## 8     PR      positiveIncrease    31067    61275
## 9     RI      positiveIncrease    30581    30116
## 10    WA hospitalizedCurrently    92643    69716
## Rows: 16,082
## Columns: 55
## $ date                        <date> 2020-12-14, 2020-12-14, 2020-12-14, 20...
## $ state                       <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive                    <dbl> 40160, 297895, 187507, 0, 420248, 15850...
## $ probableCases               <dbl> NA, 53133, 26701, NA, 15954, NA, 11826,...
## $ negative                    <dbl> 1107400, 1478907, 1705843, 2140, 212692...
## $ pending                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource      <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults            <dbl> 1147560, 1723669, 1866199, 2140, 434469...
## $ hospitalizedCurrently       <dbl> 140, 2286, 1050, NA, 3677, 14578, 1585,...
## $ hospitalizedCumulative      <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ inIcuCurrently              <dbl> NA, NA, 372, NA, 829, 3078, NA, NA, 63,...
## $ inIcuCumulative             <dbl> NA, 2363, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently       <dbl> 12, NA, 180, NA, 542, NA, NA, NA, 34, N...
## $ onVentilatorCumulative      <dbl> NA, 1353, 1095, NA, NA, NA, NA, NA, NA,...
## $ recovered                   <dbl> 7165, 174805, 163351, NA, 62118, NA, 15...
## $ dataQualityGrade            <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt                <chr> "12/14/2020 03:59", "12/14/2020 11:00",...
## $ dateModified                <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ checkTimeEt                 <chr> "12/13 22:59", "12/14 06:00", "12/13 19...
## $ death                       <dbl> 176, 4102, 2990, 0, 7358, 21046, 3969, ...
## $ hospitalized                <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ dateChecked                 <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ totalTestsViral             <dbl> 1147560, NA, 1866199, 2140, 4344693, 27...
## $ positiveTestsViral          <dbl> 47796, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral          <dbl> 1098479, NA, 1705843, NA, NA, NA, NA, N...
## $ positiveCasesViral          <dbl> NA, 244762, 160356, 0, 404294, 1585044,...
## $ deathConfirmed              <dbl> 176, 3624, 2656, NA, 6782, NA, 3398, 43...
## $ deathProbable               <dbl> NA, 478, 334, NA, 576, NA, 571, 1047, N...
## $ totalTestEncountersViral    <dbl> NA, NA, NA, NA, NA, NA, 3809889, NA, 78...
## $ totalTestsPeopleViral       <dbl> NA, 1723669, NA, NA, 2531222, NA, 19688...
## $ totalTestsAntibody          <dbl> NA, NA, NA, NA, 377958, NA, 235882, NA,...
## $ positiveTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 22560, NA, NA, ...
## $ negativeTestsAntibody       <dbl> NA, NA, NA, NA, NA, NA, 212548, NA, NA,...
## $ totalTestsPeopleAntibody    <dbl> NA, 78262, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen     <dbl> NA, NA, 179158, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen  <dbl> NA, NA, 32616, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen           <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen        <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips                        <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease            <dbl> 422, 2264, 1805, 0, 11806, 33278, 2911,...
## $ negativeIncrease            <dbl> 4040, 20347, 9262, 0, 7223, 323174, 103...
## $ total                       <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ totalTestResultsIncrease    <dbl> 4462, 27230, 10495, 0, 38805, 356452, 3...
## $ posNeg                      <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ deathIncrease               <dbl> 0, 0, 45, 0, 1, 77, 11, 81, 1, 0, 138, ...
## $ hospitalizedIncrease        <dbl> 3, 767, 64, 0, 193, 0, 48, 0, 0, 0, 142...
## $ hash                        <chr> "58baf833f72d7115b62a7e4dd1ab3545263286...
## $ commercialScore             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## 
## 
## Control totals - note that validState other than TRUE will be discarded
## 
## # A tibble: 2 x 6
##   validState    cases deaths  hosp     tests     n
##   <lgl>         <dbl>  <dbl> <dbl>     <dbl> <dbl>
## 1 FALSE         72033   1426    NA    510886  1370
## 2 TRUE       16267270 290978    NA 220582479 14712
## Rows: 14,712
## Columns: 6
## $ date   <date> 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14,...
## $ state  <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases  <dbl> 422, 2264, 1805, 11806, 33278, 2911, 7231, 164, 997, 8343, 3...
## $ deaths <dbl> 0, 0, 45, 1, 77, 11, 81, 1, 0, 138, 28, 0, 60, 6, 116, 35, 3...
## $ hosp   <dbl> 140, 2286, 1050, 3677, 14578, 1585, 1243, 239, 373, 4932, 33...
## $ tests  <dbl> 4462, 27230, 10495, 38805, 356452, 36588, 119244, 4714, 9931...
## Rows: 14,712
## Columns: 14
## $ date   <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state  <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm    <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7   <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7   <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

## 
## Recency is defined as 2020-11-15 through current
## 
## Recency is defined as 2020-11-15 through current

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 row(s) containing missing values (geom_path).
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

# Persist the census-division run; per the output below, ovrWriteError=FALSE
# leaves an existing RDS in place with a message instead of raising an error
saveToRDS(ctp_census_201214, ovrWriteError=FALSE)
## 
## File already exists: ./RInputFiles/Coronavirus/ctp_census_201214.RDS 
## 
## Not replacing the existing file since ovrWrite=FALSE
## NULL

Patterns are largely as expected. The US census divisions do not always align well with the timing of coronavirus impact. This is particularly notable in “South Atlantic” where a portion of the states had early and heavy disease while another portion had their primary impacts several months later.